# Load libraries and data

easypackages::libraries("here","ggplot2","caret","e1071","pheatmap","reshape2","NbClust","grid","patchwork","readxl","patchwork","WGCNA","psych","nlme","reshape2")
source(here("code","ndar_functions.R"))
source(here("code","euaims_functions.R"))
source(here("code","get_ggColorHue.R"))
source(here("code","cohens_d.R"))
source(here("code","Repfunctionspack6.R"))


# Global options and script-wide constants
options(stringsAsFactors = FALSE)
fontSize <- 20
nperm <- 1

# Project directories, all resolved through here()
codepath <- here("code")
datapath <- here("data")
figpath <- here("figures")
resultpath <- here("results", "ndar")
plotpath <- here("plots", "ndar")

# make_subtype: label every row by the z-scored difference between
# dbaes_atotal (SC) and dbaes_btotal (RRB).
#
# Arguments:
#   data2use  data frame containing numeric columns "dbaes_atotal" and
#             "dbaes_btotal".
#   z_thresh  threshold on the z-scored difference: z > z_thresh gives
#             "SC_over_RRB", z < -z_thresh gives "RRB_over_SC", everything
#             else is "SC_equal_RRB".
#   mean2use  mean used for z-scoring; computed from data2use when NULL.
#   sd2use    standard deviation used for z-scoring; computed from data2use
#             when NULL.
#
# Returns data2use with two added columns:
#   z_ds        z-scored difference score.
#   z_ds_group  factor that always carries the three levels "RRB_over_SC",
#               "SC_equal_RRB", "SC_over_RRB" (even if a group is empty), and
#               is NA for rows whose difference score is missing.
make_subtype <- function(data2use, z_thresh, mean2use=NULL, sd2use=NULL){
  vars2use <- c("dbaes_atotal","dbaes_btotal")
  subtype_levels <- c("RRB_over_SC","SC_equal_RRB","SC_over_RRB")
  stopifnot(all(vars2use %in% names(data2use)))

  # compute difference score (SC minus RRB)
  diff_score <- data2use[[vars2use[1]]] - data2use[[vars2use[2]]]

  # compute mean and sd if necessary; missing scores are ignored so that a
  # single NA does not turn every z-score into NA
  if (is.null(mean2use)){
    mean2use <- mean(diff_score, na.rm = TRUE)
  }
  if (is.null(sd2use)){
    sd2use <- sd(diff_score, na.rm = TRUE)
  }

  # compute z-score
  data2use$z_ds <- (diff_score - mean2use)/sd2use

  # make subtype factor; which() drops NA comparisons, and rows without a
  # difference score get NA instead of being counted as "SC_equal_RRB"
  subtype <- rep("SC_equal_RRB", nrow(data2use))
  subtype[which(data2use$z_ds > z_thresh)] <- "SC_over_RRB"
  subtype[which(data2use$z_ds < (z_thresh*-1))] <- "RRB_over_SC"
  subtype[is.na(diff_score)] <- NA

  # fixed levels keep tables comparable across datasets
  data2use$z_ds_group <- factor(subtype, levels = subtype_levels)
  data2use
} # function make_subtype

# Load the tidy verbal Discovery and Replication tables
Dverbal_Discovery <- read.csv(file.path(datapath, "tidy_verbal_disc.csv"))
Dverbal_Replication <- read.csv(file.path(datapath, "tidy_verbal_rep.csv"))

# The two score columns used as features throughout the script
vars2use <- c("dbaes_atotal", "dbaes_btotal")

# Use subjectkey as row names so rows can be addressed by subject
rownames(Dverbal_Discovery) <- Dverbal_Discovery$subjectkey
rownames(Dverbal_Replication) <- Dverbal_Replication$subjectkey

#------------------------------------------------------------------------------
# add in ADOS and IQ

# Copy columns from a per-subject lookup table into `target`, aligning rows on
# subjectkey (`target` must use subjectkey as its row names).
#   target  data frame to extend
#   lookup  data frame with a `subjectkey` column
#   cols    named character vector: names are the columns created in `target`,
#           values are the `lookup` columns they are filled from
# Subjects without a lookup row keep NA. Lookup rows whose subjectkey is not a
# row name of `target` are skipped, because assigning by an unknown row name
# would append a new, mostly-NA row to `target`.
add_lookup_columns <- function(target, lookup, cols){
  known <- lookup$subjectkey %in% rownames(target)
  # as.character guards against a factor key being used as integer positions
  keys <- as.character(lookup$subjectkey[known])
  for (new_col in names(cols)){
    target[[new_col]] <- NA
    if (any(known)){
      target[keys, new_col] <- lookup[known, cols[[new_col]]]
    }
  }
  target
}

# ADOS --------------------------------------------------------------------------
ados_Discovery <- read.csv(file.path(datapath,"ndar_ados_css_discovery.csv"))
ados_Replication <- read.csv(file.path(datapath,"ndar_ados_css_replication.csv"))
ados_Discovery$ados_age <- ados_Discovery$interview_age
ados_Replication$ados_age <- ados_Replication$interview_age

ados_cols <- c(ados_age = "ados_age",
               ados_sa_css = "ados_sa_css",
               ados_rrb_css = "ados_rrb_css")
Dverbal_Discovery <- add_lookup_columns(Dverbal_Discovery, ados_Discovery, ados_cols)
Dverbal_Replication <- add_lookup_columns(Dverbal_Replication, ados_Replication, ados_cols)

# IQ ----------------------------------------------------------------------------
iq_Discovery <- read.csv(file.path(datapath,"ndar_iq_discovery.csv"))
iq_Replication <- read.csv(file.path(datapath,"ndar_iq_replication.csv"))
iq_Discovery$iq_age <- iq_Discovery$interview_age
iq_Replication$iq_age <- iq_Replication$interview_age

iq_cols <- c(iq_age = "iq_age", iq = "IQ")
Dverbal_Discovery <- add_lookup_columns(Dverbal_Discovery, iq_Discovery, iq_cols)
Dverbal_Replication <- add_lookup_columns(Dverbal_Replication, iq_Replication, iq_cols)

# Hierarchical clustering

#------------------------------------------------------------------------------
# Subtype using hierarchical clustering and dynamic hybrid tree cut algorithm to find the subtypes

# NDAR Discovery --------------------------------------------------------------

# deep split parameter
dS <- 0
maxScores <- c(3, 4)

data2use <- Dverbal_Discovery[, vars2use]

# cluster the two score columns; the clustergram PDF path is passed along
fname2save <- file.path(plotpath,
                        sprintf("clustergram_ADIalgoTotals_verbalDiscovery_euclidean_ward_deepSplit%d.pdf", dS))
verbalDiscovery_clustResults <- ClusterData(data2use,
                                            deepSplit = dS,
                                            fname2save = fname2save)
##  ..cutHeight not given, setting it to 43.9  ===>  99% of the (truncated) height range in dendro.
##  ..done.

# map the colour labels produced by the clustering onto numeric labels
oldColors <- c("blue", "brown", "green", "red", "turquoise", "yellow")
newColors <- c("5", "4", "6", "3", "2", "1")

verbalDiscovery_clustResults <- relabelClusters(verbalDiscovery_clustResults, oldColors, newColors)
makeClustergram(verbalDiscovery_clustResults, fname2save = fname2save)
## quartz_off_screen 
##                 2

# make plot with all individuals shown as lines
df2use <- Dverbal_Discovery[, c("subjectkey", vars2use)]
df2use$subgrp <- factor(verbalDiscovery_clustResults$dynamicColors)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

# first six hues of a seven-hue palette, one per cluster label
colors2use <- get_ggColorHue(7)
colors2use <- colors2use[1:6]

p <- ggplot(data = df4plot,
            aes(x = variable, y = value, colour = subgrp, group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  guides(color = "none") +   # "none" replaces the deprecated FALSE
  ylim(0, 1) +
  scale_colour_manual(values = colors2use) +
  ylab("Percent Severity") +
  xlab("ADI-R subscale")

fname2save <- file.path(plotpath,
                        sprintf("summaryPlot_IndividualSubs_ADIalgoTotals_verbalDiscovery_euclidean_ward_deepSplit%d.pdf", dS))
# name the plot explicitly instead of relying on ggplot2's last_plot()
ggsave(filename = fname2save, plot = p)
p

# scatterplot
p_disc <- ggplot(data = df2use, aes(x = SC, y = RRB, colour = factor(subgrp))) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ggtitle("NDAR Discovery") +
  ylim(0, 1) +
  xlim(0, 1) +
  scale_colour_manual(values = colors2use)
p_disc

# NDAR Replication ------------------------------------------------------------

# deep split parameter
dS <- 0
maxScores <- c(3, 4)

data2use <- Dverbal_Replication[, vars2use]

# cluster the two score columns; the clustergram PDF path is passed along
fname2save <- file.path(plotpath,
                        sprintf("clustergram_ADIalgoTotals_verbalReplication_euclidean_ward_deepSplit%d.pdf", dS))
verbalReplication_clustResults <- ClusterData(data2use,
                                              deepSplit = dS,
                                              fname2save = fname2save)
##  ..cutHeight not given, setting it to 42.9  ===>  99% of the (truncated) height range in dendro.
##  ..done.

# map the colour labels produced by the clustering onto numeric labels
oldColors <- c("black", "blue", "brown", "green", "red", "turquoise", "yellow")
newColors <- c("7", "1", "4", "3", "2", "6", "5")

verbalReplication_clustResults <- relabelClusters(verbalReplication_clustResults, oldColors, newColors)
makeClustergram(verbalReplication_clustResults, fname2save = fname2save)
## quartz_off_screen 
##                 2

# make plot with all individuals shown as lines
df2use <- Dverbal_Replication[, c("subjectkey", vars2use)]
df2use$subgrp <- factor(verbalReplication_clustResults$dynamicColors)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

# NOTE(review): no scale_colour_manual() here, so ggplot2 default colours are
# used; presumably because seven cluster labels are relabelled above while the
# Discovery palette holds only six colours - confirm.
p <- ggplot(data = df4plot,
            aes(x = variable, y = value, colour = subgrp, group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  guides(color = "none") +   # "none" replaces the deprecated FALSE
  ylim(0, 1) +
  ylab("Percent Severity") +
  xlab("ADI-R subscale")

fname2save <- file.path(plotpath,
                        sprintf("summaryPlot_IndividualSubs_ADIalgoTotals_verbalReplication_euclidean_ward_deepSplit%d.pdf", dS))
# name the plot explicitly instead of relying on ggplot2's last_plot()
ggsave(filename = fname2save, plot = p)
p

# scatterplot
p_rep <- ggplot(data = df2use, aes(x = SC, y = RRB, colour = factor(subgrp))) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ggtitle("NDAR Replication") +
  ylim(0, 1) +
  xlim(0, 1)
p_rep

# Subtype using hierarchical agglomerative clustering, looking for k=3
# Features: SC, RRB and their difference score (SC minus RRB)

# how many clusters do you want?
nclusters <- 3

# Discovery: Euclidean distances -> Ward (ward.D2) tree -> cut into k groups
ds <- Dverbal_Discovery[, vars2use[1]] - Dverbal_Discovery[, vars2use[2]]
distmat <- dist(x = cbind(Dverbal_Discovery[, vars2use], ds),
                method = "euclidean")
disc_tree <- hclust(d = distmat, method = "ward.D2")
treecut_res <- cutree(tree = disc_tree, k = nclusters)
Dverbal_Discovery$hc_subgrp <- treecut_res

# Replication: same steps; the working variables above (including disc_tree)
# are overwritten with the Replication results
ds <- Dverbal_Replication[, vars2use[1]] - Dverbal_Replication[, vars2use[2]]
distmat <- dist(x = cbind(Dverbal_Replication[, vars2use], ds),
                method = "euclidean")
disc_tree <- hclust(d = distmat, method = "ward.D2")
treecut_res <- cutree(tree = disc_tree, k = nclusters)
Dverbal_Replication$hc_subgrp <- treecut_res

# Plot Discovery dataset after using hierarchical agglomerative clustering

# Discovery set
maxScores <- c(3, 4)

# Discovery - make plot with all individuals shown as lines
# vars2use holds the two score columns read below; it replaces
# adi_total_vars2use, which this script never defines
df2use <- Dverbal_Discovery[, c("subjectkey", vars2use)]
df2use$subgrp <- factor(Dverbal_Discovery$hc_subgrp)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

p <- ggplot(data = df4plot,
            aes(x = variable, y = value, colour = subgrp, group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  guides(color = "none") +   # "none" replaces the deprecated FALSE
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p

# scatterplot
p_disc <- ggplot(data = Dverbal_Discovery,
                 aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(hc_subgrp))) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ggtitle("NDAR Discovery")
p_disc

# subgroup sizes
table(Dverbal_Discovery$hc_subgrp)
## 
##   1   2   3 
## 339 225 325
# Plot Replication dataset after using hierarchical agglomerative clustering
maxScores <- c(3, 4)

# Replication - make plot with all individuals shown as lines
# vars2use holds the two score columns read below; it replaces
# adi_total_vars2use, which this script never defines
df2use <- Dverbal_Replication[, c("subjectkey", vars2use)]
df2use$subgrp <- factor(Dverbal_Replication$hc_subgrp)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

p <- ggplot(data = df4plot,
            aes(x = variable, y = value, colour = subgrp, group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  guides(color = "none") +   # "none" replaces the deprecated FALSE
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p

# scatterplot
p_rep <- ggplot(data = Dverbal_Replication,
                aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(hc_subgrp))) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ggtitle("NDAR Replication")
p_rep

# subgroup sizes
table(Dverbal_Replication$hc_subgrp)
## 
##   1   2   3 
## 357 248 285
# Subtype using hierarchical agglomerative clustering, but use NbClust to find optimal number of clusters
# (distance, min.nc, max.nc and index are NbClust's defaults, spelled out)
nbc_disc_res <- NbClust(data = Dverbal_Discovery[, vars2use],
                        distance = "euclidean",
                        min.nc = 2,
                        max.nc = 15,
                        method = "ward.D2",
                        index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 7 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 3 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************
# Same NbClust search on the Replication set
# (distance, min.nc, max.nc and index are NbClust's defaults, spelled out)
nbc_rep_res <- NbClust(data = Dverbal_Replication[, vars2use],
                       distance = "euclidean",
                       min.nc = 2,
                       max.nc = 15,
                       method = "ward.D2",
                       index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 5 proposed 7 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 3 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
# Plot Discovery dataset after using hierarchical agglomerative clustering and NbClust

# Discovery set
maxScores <- c(3, 4)

# Discovery - make plot with all individuals shown as lines
# vars2use holds the two score columns read below; it replaces
# adi_total_vars2use, which this script never defines
df2use <- Dverbal_Discovery[, c("subjectkey", vars2use)]
df2use$subgrp <- factor(nbc_disc_res$Best.partition)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

p <- ggplot(data = df4plot,
            aes(x = variable, y = value, colour = subgrp, group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  guides(color = "none") +   # "none" replaces the deprecated FALSE
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p

# scatterplot
Dverbal_Discovery$nbclust_subgrp <- factor(nbc_disc_res$Best.partition)
p_disc <- ggplot(data = Dverbal_Discovery,
                 aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(nbclust_subgrp))) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ggtitle("NDAR Discovery")
p_disc

# subgroup sizes
table(Dverbal_Discovery$nbclust_subgrp)
## 
##   1   2   3   4 
## 149 369 171 200
# Plot Replication dataset after using hierarchical agglomerative clustering and NbClust
maxScores <- c(3, 4)

# Replication - make plot with all individuals shown as lines
# vars2use holds the two score columns read below; it replaces
# adi_total_vars2use, which this script never defines
df2use <- Dverbal_Replication[, c("subjectkey", vars2use)]
df2use$subgrp <- factor(nbc_rep_res$Best.partition)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

p <- ggplot(data = df4plot,
            aes(x = variable, y = value, colour = subgrp, group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  guides(color = "none") +   # "none" replaces the deprecated FALSE
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p

# scatterplot
Dverbal_Replication$nbclust_subgrp <- factor(nbc_rep_res$Best.partition)
p_rep <- ggplot(data = Dverbal_Replication,
                aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(nbclust_subgrp))) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ggtitle("NDAR Replication")
p_rep

# subgroup sizes
table(Dverbal_Replication$nbclust_subgrp)
## 
##   1   2   3 
## 450 353  87

# SC-RRB difference z = 0.5

#------------------------------------------------------------------------------
# Subtyping using Z-score of the difference between SC and RRB

# Z-score threshold to use for subtyping
z_thresh <- 0.5

# Discovery: z-score the difference with the Discovery set's own mean and sd
ds_disc <- Dverbal_Discovery[, vars2use[1]] - Dverbal_Discovery[, vars2use[2]]
mean2use <- mean(ds_disc)
sd2use <- sd(ds_disc)
Dverbal_Discovery <- make_subtype(
  data2use = Dverbal_Discovery,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

# Replication: z-score the difference with the Replication set's own mean and sd
ds_rep <- Dverbal_Replication[, vars2use[1]] - Dverbal_Replication[, vars2use[2]]
mean2use <- mean(ds_rep)
sd2use <- sd(ds_rep)
Dverbal_Replication <- make_subtype(
  data2use = Dverbal_Replication,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#------------------------------------------------------------------------------
# Table of subtypes X sex

# Discovery: counts with subtype (z_ds_group) in rows and sex in columns
table(Dverbal_Discovery$z_ds_group,Dverbal_Discovery$sex)
##               
##                  F   M
##   RRB_over_SC   55 227
##   SC_equal_RRB  82 256
##   SC_over_RRB   60 209
# Chi-squared test of independence between subtype and sex (Discovery);
# the result is stored in cs_res and printed
cs_res = chisq.test(Dverbal_Discovery$z_ds_group,Dverbal_Discovery$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Discovery$z_ds_group and Dverbal_Discovery$sex
## X-squared = 2.0214, df = 2, p-value = 0.364
# Replication: counts with subtype (z_ds_group) in rows and sex in columns
table(Dverbal_Replication$z_ds_group,Dverbal_Replication$sex)
##               
##                  F   M
##   RRB_over_SC   51 214
##   SC_equal_RRB  93 281
##   SC_over_RRB   52 199
# Chi-squared test of independence between subtype and sex (Replication);
# overwrites the Discovery result held in cs_res
cs_res = chisq.test(Dverbal_Replication$z_ds_group,Dverbal_Replication$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Replication$z_ds_group and Dverbal_Replication$sex
## X-squared = 3.2006, df = 2, p-value = 0.2018
#------------------------------------------------------------------------------
# Descriptive stats

# Discovery
df2use <- Dverbal_Discovery[, c("subjectkey", "dataset_id", "collection_id",
                                "interview_age", "sex", "z_ds_group",
                                "ados_age", "ados_sa_css", "ados_rrb_css",
                                "iq", "dbaes_atotal", "dbaes_btotal")]
# age is interview_age divided by 12; ados_age is left on the scale it was
# read in with
df2use$age <- df2use$interview_age / 12

# per-subtype descriptive statistics for the selected columns
cols2use <- c("z_ds_group", "sex", "age",
              "ados_age", "ados_sa_css", "ados_rrb_css",
              "iq", "dbaes_atotal", "dbaes_btotal")
res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 282   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 282    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 282   9.89  4.20   9.33    9.55  3.71  2.00  27.17  25.17
## ados_age        4  39  94.54 47.03  86.00   92.00 57.82 27.00 202.00 175.00
## ados_sa_css     5  39   6.49  2.28   7.00    6.55  2.97  2.00  10.00   8.00
## ados_rrb_css    6  39   7.87  1.89   8.00    8.09  1.48  1.00  10.00   9.00
## iq              7  76 100.91 19.83 102.00  102.69 17.79 42.00 139.00  97.00
## dbaes_atotal    8 282   0.23  0.11   0.22    0.23  0.12  0.00   0.51   0.51
## dbaes_btotal    9 282   0.45  0.13   0.45    0.45  0.13  0.14   0.79   0.65
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           0.88     1.18 0.25
## ados_age      0.45    -0.92 7.53
## ados_sa_css  -0.22    -1.04 0.37
## ados_rrb_css -1.34     2.55 0.30
## iq           -0.89     0.85 2.27
## dbaes_atotal  0.09    -0.65 0.01
## dbaes_btotal  0.10    -0.36 0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 338   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 338    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 338   9.31  5.95   8.25    8.46  4.82   0  45.75  45.75  2.08
## ados_age        4  54  77.09 40.60  60.50   72.30 34.84  33 182.00 149.00  0.85
## ados_sa_css     5  54   6.83  2.10   7.00    6.98  1.48   1  10.00   9.00 -0.58
## ados_rrb_css    6  54   7.65  2.34   8.00    8.05  1.48   1  10.00   9.00 -1.58
## iq              7  86 106.20 15.96 107.00  106.57 17.79  64 138.00  74.00 -0.22
## dbaes_atotal    8 338   0.29  0.13   0.29    0.30  0.14   0   0.67   0.67 -0.02
## dbaes_btotal    9 338   0.30  0.13   0.30    0.31  0.13   0   0.68   0.68 -0.10
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              6.83 0.32
## ados_age        -0.37 5.52
## ados_sa_css      0.12 0.29
## ados_rrb_css     2.13 0.32
## iq              -0.46 1.72
## dbaes_atotal    -0.31 0.01
## dbaes_btotal    -0.16 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 269   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 269    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 269   7.39  5.02   5.92    6.52  2.97  1.67  37.33  35.67
## ados_age        4  60  72.73 37.62  66.00   67.00 29.65 30.00 172.00 142.00
## ados_sa_css     5  60   7.33  1.69   7.00    7.35  1.48  4.00  10.00   6.00
## ados_rrb_css    6  60   7.88  2.10   8.00    8.21  1.48  1.00  10.00   9.00
## iq              7  37 104.81 18.05 111.00  105.77 13.34 40.00 140.00 100.00
## dbaes_atotal    8 269   0.45  0.14   0.46    0.45  0.14  0.11   0.87   0.76
## dbaes_btotal    9 269   0.24  0.11   0.23    0.23  0.11  0.00   0.57   0.57
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.31     7.14 0.31
## ados_age      1.18     0.51 4.86
## ados_sa_css  -0.07    -0.88 0.22
## ados_rrb_css -1.63     3.19 0.27
## iq           -1.08     2.46 2.97
## dbaes_atotal  0.13    -0.12 0.01
## dbaes_btotal  0.25    -0.25 0.01
# Replication
df2use <- Dverbal_Replication[, c("subjectkey", "dataset_id", "collection_id",
                                  "interview_age", "sex", "z_ds_group",
                                  "ados_age", "ados_sa_css", "ados_rrb_css",
                                  "iq", "dbaes_atotal", "dbaes_btotal")]
# age is interview_age divided by 12; ados_age is left on the scale it was
# read in with
df2use$age <- df2use$interview_age / 12

# per-subtype descriptive statistics for the selected columns
cols2use <- c("z_ds_group", "sex", "age",
              "ados_age", "ados_sa_css", "ados_rrb_css",
              "iq", "dbaes_atotal", "dbaes_btotal")
res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 265   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 265    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 265   9.80  5.00   9.17    9.23  4.08  2.58  30.67  28.08
## ados_age        4  29  85.28 46.93  76.00   80.64 51.89 36.00 196.00 160.00
## ados_sa_css     5  29   6.86  2.00   7.00    6.92  1.48  3.00  10.00   7.00
## ados_rrb_css    6  29   7.48  1.99   8.00    7.64  1.48  1.00  10.00   9.00
## iq              7  77 102.53 18.34 104.00  103.24 16.31 57.00 152.00  95.00
## dbaes_atotal    8 265   0.23  0.11   0.23    0.23  0.11  0.01   0.61   0.60
## dbaes_btotal    9 265   0.47  0.13   0.46    0.46  0.11  0.15   0.93   0.78
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           1.40     2.77 0.31
## ados_age      0.84    -0.30 8.71
## ados_sa_css  -0.21    -0.83 0.37
## ados_rrb_css -1.10     1.62 0.37
## iq           -0.27     0.41 2.09
## dbaes_atotal  0.57     0.64 0.01
## dbaes_btotal  0.43     0.70 0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 374   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 374    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 374   8.89  5.05   8.25    8.22  4.45   0  33.83  33.83  1.59
## ados_age        4  66  81.97 39.90  77.00   78.04 47.44  35 188.00 153.00  0.65
## ados_sa_css     5  66   6.85  2.14   7.00    6.91  2.97   2  10.00   8.00 -0.13
## ados_rrb_css    6  66   7.11  2.60   8.00    7.46  1.48   1  10.00   9.00 -1.18
## iq              7  72 108.38 15.57 108.00  107.78 14.08  69 146.00  77.00  0.29
## dbaes_atotal    8 374   0.30  0.14   0.30    0.30  0.14   0   0.74   0.74  0.02
## dbaes_btotal    9 374   0.32  0.14   0.31    0.32  0.14   0   0.81   0.81  0.08
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              3.92 0.26
## ados_age        -0.51 4.91
## ados_sa_css     -0.88 0.26
## ados_rrb_css     0.56 0.32
## iq              -0.12 1.84
## dbaes_atotal    -0.23 0.01
## dbaes_btotal    -0.02 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 251   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 251    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 251   7.95  6.06   6.08    6.73  3.21  0.00  40.92  40.92
## ados_age        4  58  72.14 28.95  66.50   69.04 25.20 30.00 141.00 111.00
## ados_sa_css     5  58   7.03  1.86   7.00    7.06  1.48  3.00  10.00   7.00
## ados_rrb_css    6  58   7.60  2.05   8.00    7.79  1.48  1.00  10.00   9.00
## iq              7  37 110.92 17.82 111.00  111.68 17.79 62.00 146.00  84.00
## dbaes_atotal    8 251   0.47  0.13   0.46    0.46  0.14  0.14   0.96   0.82
## dbaes_btotal    9 251   0.25  0.12   0.25    0.25  0.12  0.00   0.66   0.66
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.21     5.71 0.38
## ados_age      0.99     0.19 3.80
## ados_sa_css   0.03    -0.78 0.24
## ados_rrb_css -1.09     1.45 0.27
## iq           -0.52    -0.12 2.93
## dbaes_atotal  0.41     0.30 0.01
## dbaes_btotal  0.14    -0.11 0.01
#------------------------------------------------------------------------------
# Tests of differences in interview age across the subtypes

# Discovery: linear model of interview_age on subtype, then its ANOVA table
mod2use <- lm(interview_age ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  136192   68096  17.714 2.861e-08 ***
## Residuals  886 3405874    3844                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: linear model of interview_age on subtype, then its ANOVA table
mod2use <- lm(interview_age ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2   63457   31728  7.7318 0.0004689 ***
## Residuals  887 3639907    4104                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in SC across the subtypes

# Discovery: linear model of dbaes_atotal on subtype, then its ANOVA table
mod2use <- lm(dbaes_atotal ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.1174  3.5587  210.86 < 2.2e-16 ***
## Residuals  886 14.9533  0.0169                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: linear model of dbaes_atotal on subtype, then its ANOVA table
mod2use <- lm(dbaes_atotal ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.5809  3.7904  225.12 < 2.2e-16 ***
## Residuals  887 14.9345  0.0168                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in RRB across the subtypes

# Discovery: linear model of dbaes_btotal on subtype, then its ANOVA table
mod2use <- lm(dbaes_btotal ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.6165  3.3083  213.82 < 2.2e-16 ***
## Residuals  886 13.7083  0.0155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: linear model of dbaes_btotal on subtype, then its ANOVA table
mod2use <- lm(dbaes_btotal ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.6364  3.3182  187.56 < 2.2e-16 ***
## Residuals  887 15.6922  0.0177                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in ADOS SA CSS across the subtypes

# Discovery: linear model of ados_sa_css on subtype, then its ANOVA table
mod2use <- lm(ados_sa_css ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2  17.89  8.9468  2.2346 0.1106
## Residuals  150 600.58  4.0038
# Replication: linear model of ados_sa_css on subtype, then its ANOVA table
mod2use <- lm(ados_sa_css ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   1.19  0.5973  0.1479 0.8627
## Residuals  150 605.86  4.0391
#------------------------------------------------------------------------------
# Tests of differences in ADOS RRB CSS across the subtypes

# Discovery: linear model of ados_rrb_css on subtype, then its ANOVA table
mod2use <- lm(ados_rrb_css ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   1.86  0.9309  0.2033 0.8163
## Residuals  150 686.86  4.5790
# Replication: one-way ANOVA of ados_rrb_css on the subtype factor
mod2use <- lm(
  formula = ados_rrb_css ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   8.13  4.0625   0.772 0.4639
## Residuals  150 789.38  5.2625
#------------------------------------------------------------------------------
# Tests of differences in IQ across the subtypes

# Discovery: one-way ANOVA of iq on the subtype factor
mod2use <- lm(
  formula = iq ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   1165  582.52  1.8162 0.1654
## Residuals  196  62864  320.73
# Replication: one-way ANOVA of iq on the subtype factor
mod2use <- lm(
  formula = iq ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## z_ds_group   2   2187 1093.72  3.6915 0.02681 *
## Residuals  183  54219  296.28                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Make scatterplots with difference score Z subtypes in different colors
# maxScores is not read by the plotting code in this section; it is kept
# because other parts of the script may rely on it.
maxScores <- c(3, 4)

# Discovery scatterplot of SC (x) against RRB (y), coloured by subtype
p_disc <- ggplot(
  data = Dverbal_Discovery,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Discovery")

# Legend-free copy for the saved panel. guides(colour = FALSE) is deprecated
# in current ggplot2 releases; "none" is the supported way to drop a legend.
p1_top_left <- p_disc + guides(colour = "none")
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Disc_scatterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p1_top_left
)
p_disc

# Number of Discovery participants in each subtype
table(Dverbal_Discovery[["z_ds_group"]])
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          282          338          269
# Pearson correlation between SC and RRB in the Discovery set
cor_res <- cor.test(
  x = Dverbal_Discovery$dbaes_atotal,
  y = Dverbal_Discovery$dbaes_btotal
)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1554332 0.2806578
## sample estimates:
##       cor 
## 0.2189469
# Replication scatterplot of SC (x) against RRB (y), coloured by subtype
p_rep <- ggplot(
  data = Dverbal_Replication,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Replication")

# Legend-free copy for the saved panel. guides(colour = FALSE) is deprecated
# in current ggplot2 releases; "none" is the supported way to drop a legend.
p2_bottom_left <- p_rep + guides(colour = "none")
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Rep_scatterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p2_bottom_left
)
p_rep

# Number of Replication participants in each subtype
table(Dverbal_Replication[["z_ds_group"]])
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          265          374          251
# Pearson correlation between SC and RRB in the Replication set
cor_res <- cor.test(
  x = Dverbal_Replication$dbaes_atotal,
  y = Dverbal_Replication$dbaes_btotal
)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1700824 0.2943934
## sample estimates:
##       cor 
## 0.2331903
#------------------------------------------------------------------------------
# Run supervised model with Discovery as training and Replication as Test

# Validation set-up: Discovery is the training set, Replication the test set
train_data <- Dverbal_Discovery
test_data <- Dverbal_Replication

# Training-set subtypes, z-scored with the training set's own mean and sd of
# the SC - RRB difference score
mean2use <- mean(train_data[[vars2use[1]]] - train_data[[vars2use[2]]])
sd2use <- sd(train_data[[vars2use[1]]] - train_data[[vars2use[2]]])
tmp_train <- make_subtype(
  data2use = train_data, z_thresh = z_thresh,
  mean2use = mean2use, sd2use = sd2use
)

# Test-set subtypes, z-scored with the test set's own mean and sd; these act
# as the reference labels in the confusion matrix computed afterwards
mean2use <- mean(test_data[[vars2use[1]]] - test_data[[vars2use[2]]])
sd2use <- sd(test_data[[vars2use[1]]] - test_data[[vars2use[2]]])
tmp_test <- make_subtype(
  data2use = test_data, z_thresh = z_thresh,
  mean2use = mean2use, sd2use = sd2use
)




#==============================================================================
#==============================================================================
# Predicted labels: subtype the test set using the TRAINING set's mean and sd
# of the SC - RRB difference score
mean2use <- mean(train_data[, vars2use[1]] - train_data[, vars2use[2]])
sd2use <- sd(train_data[, vars2use[1]] - train_data[, vars2use[2]])
train_mean <- mean2use
train_sd <- sd2use

pred_labels <- make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = train_mean,
  sd2use = train_sd
)

# make_subtype() builds its factor from the labels actually present, so an
# empty subtype would silently drop a level and the cross-tab would no longer
# be a 3x3 matrix with matching row/column order. Pin the level set so the
# diagonal always pairs identical labels.
subtype_levels <- c("RRB_over_SC", "SC_equal_RRB", "SC_over_RRB")

# Rows: labels from the test set's own norms (tmp_test); columns: labels from
# the training norms (pred_labels)
confmat <- table(
  factor(tmp_test$z_ds_group, levels = subtype_levels),
  factor(pred_labels$z_ds_group, levels = subtype_levels)
)

# Accuracy = agreeing cases (diagonal) over all test cases
acc <- sum(diag(confmat)) / nrow(pred_labels)

# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#==============================================================================
#==============================================================================




# plot confusion matrix
# Every new grid page gets a viewport anchored at the top-right corner that
# covers 90% of the page, leaving a strip on the left and at the bottom for
# the axis titles drawn after the heatmap.
setHook(
  "grid.newpage",
  function() {
    pushViewport(viewport(
      x = 1, y = 1, width = 0.9, height = 0.9,
      name = "vp", just = c("right", "top")
    ))
  },
  action = "prepend"
)

# Cell colour is the row-wise percentage; the printed numbers are raw counts.
# pheatmap expects `breaks` to be one element longer than `color`, hence 101
# break points for the 100 colours.
pheatmap(
  confmat / rowSums(confmat) * 100,
  display_numbers = confmat,
  color = colorRampPalette(c('white','red'))(100),
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  fontsize_number = fontSize,
  fontsize_row = fontSize,
  fontsize_col = fontSize,
  labels_row = c("RRB>SC","SC=RRB","SC>RRB"),
  labels_col = c("RRB>SC","SC=RRB","SC>RRB"),
  angle_col = 90,
  breaks = seq(0, 100, length.out = 101)
)
setHook("grid.newpage", NULL, "replace")

# confmat rows hold the test set's own labels (tmp_test) and its columns the
# labels predicted from the training norms (pred_labels). The title under the
# heatmap therefore describes the columns (predicted) and the rotated title on
# the left describes the rows (actual).
grid::grid.text("Predicted Labels", y = -0.07, gp = gpar(fontsize = fontSize))
grid::grid.text("Actual Labels", x = -0.07, rot = 90, gp = gpar(fontsize = fontSize))

# show accuracy
# Keep the observed accuracy under its own name and display it
true_accuracy <- acc
true_accuracy
## [1] 0.9775281
#================================================================================
#================================================================================
# #------------------------------------------------------------------------------
# # Permute subtype labels to examine how well supervised model performs
#
# # nperm = 10000
#
# # make subtypes using z-scores computed from the mean and sd of the training set
# train_data = Dverbal_Discovery
# test_data = Dverbal_Replication
# mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# tmp_train = make_subtype(data2use = train_data,
#                          z_thresh = z_thresh,
#                          mean2use = mean2use,
#                          sd2use = sd2use)
#
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# tmp_test = make_subtype(data2use = test_data,
#                         z_thresh = z_thresh,
#                         mean2use = mean2use,
#                         sd2use = sd2use)
#
# acc = vector(length = nperm)
# for (iperm in 1:nperm){
#   # set seed for reproducibility
#   set.seed(iperm)
#
#   sc_perm = sample(train_data[,vars2use[1]])
#   rrb_perm = sample(train_data[,vars2use[2]])
#   perm_mean2use = mean(sc_perm - rrb_perm)
#   perm_sd2use = sd(sc_perm - rrb_perm)
#   # perm_mean2use = mean(train_data[,vars2use[1]] - rrb_perm)
#   # perm_sd2use = sd(train_data[,vars2use[1]] - rrb_perm)
#   pred_labels = make_subtype(data2use = tmp_test,
#                         z_thresh = z_thresh,
#                         mean2use = perm_mean2use,
#                         sd2use = perm_sd2use)
#   confmat = table(tmp_test$z_ds_group,pred_labels$z_ds_group)
#   acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/dim(pred_labels)[1]
#
#   # compute model
#   permuted_labels = sample(tmp_train$z_ds_group)
#   mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
#   pred_labels = predict(mod2use, tmp_test[,vars2use])
#   confmat = table(tmp_test$z_ds_group,pred_labels)
#   acc[iperm] = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#   #============================================================================
# } # for (iperm in 1:nperm)
#
# df2plot = data.frame(Accuracy = acc)
# p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
# p
#
# # compute p-value
# pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
# pval
#================================================================================
#================================================================================




#------------------------------------------------------------------------------
# Plot difference score Z subtypes in Discovery set
# maxScores is not read by the plotting code in this section; it is kept
# because other parts of the script may rely on it.
maxScores <- c(3, 4)

# Discovery - make plot with all individuals shown as lines
df2use <- Dverbal_Discovery[, c("subjectkey", adi_total_vars2use)]
df2use$subgrp <- factor(tmp_train$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# Long format: one row per subject and subscale (SC or RRB)
df4plot <- melt(
  df2use,
  id.vars = c("subjectkey", "subgrp"),
  measure.vars = c("SC", "RRB")
)

# One facet per subtype; each subject is a line joining its SC and RRB values.
# guides(... = FALSE) is deprecated in current ggplot2 releases, so the legend
# is removed with "none" (once; the original repeated the call).
p <- ggplot(
  data = df4plot,
  aes(x = variable, y = value, colour = subgrp, group = subjectkey)
) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  ylim(0, 1) +
  guides(colour = "none") +
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p3_middle_top <- p
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Disc_jitterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p3_middle_top
)
p

# Plot difference score Z subtypes in Replication set
# maxScores is not read by the plotting code in this section; it is kept
# because other parts of the script may rely on it.
maxScores <- c(3, 4)

# Replication - make plot with all individuals shown as lines
df2use <- Dverbal_Replication[, c("subjectkey", adi_total_vars2use)]
df2use$subgrp <- factor(tmp_test$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# Long format: one row per subject and subscale (SC or RRB)
df4plot <- melt(
  df2use,
  id.vars = c("subjectkey", "subgrp"),
  measure.vars = c("SC", "RRB")
)

# One facet per subtype; each subject is a line joining its SC and RRB values.
# guides(... = FALSE) is deprecated in current ggplot2 releases, so the legend
# is removed with "none" (once; the original repeated the call).
p <- ggplot(
  data = df4plot,
  aes(x = variable, y = value, colour = subgrp, group = subjectkey)
) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  ylim(0, 1) +
  guides(colour = "none") +
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p4_middle_bottom <- p
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Rep_jitterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p4_middle_bottom
)
p

#------------------------------------------------------------------------------
# Apply NDAR subtypes to EU-AIMS LEAP data

# Load the full NDAR verbal sample and the EU-AIMS LEAP data. The NDAR SC/RRB
# columns are used as read; the EU-AIMS SC/RRB scores are built below from the
# per-criterion percent-severity columns.
Dverbal <- read.csv(file.path(datapath, "tidy_verbal.csv"))
euaims_data <- read.csv(file.path(datapath, "tidy_euaims.csv"))

# Keep ASD participants that have all seven A/B criterion scores
pct_cols <- c(
  "A1_pct_severity", "A2_pct_severity", "A3_pct_severity",
  "B1_pct_severity", "B2_pct_severity", "B3_pct_severity", "B4_pct_severity"
)
mask1 <- euaims_data$Diagnosis == "ASD"
mask2 <- !complete.cases(euaims_data[, pct_cols]) # TRUE when any score is NA
euaims_data <- subset(euaims_data, (mask1 & !mask2))

# SC = mean of the three A criteria, RRB = mean of the four B criteria
euaims_data[, vars2use[1]] <- with(
  euaims_data,
  (A1_pct_severity + A2_pct_severity + A3_pct_severity) / 3
)
euaims_data[, vars2use[2]] <- with(
  euaims_data,
  (B1_pct_severity + B2_pct_severity + B3_pct_severity + B4_pct_severity) / 4
)

train_data <- Dverbal
test_data <- euaims_data

# Norms (mean and sd of the SC - RRB difference) from the whole NDAR sample
mean2use <- mean(train_data[[vars2use[1]]] - train_data[[vars2use[2]]], na.rm = TRUE)
sd2use <- sd(train_data[[vars2use[1]]] - train_data[[vars2use[2]]], na.rm = TRUE)
c(mean2use, sd2use)
## [1] -0.01045243  0.19482749
# Label both samples with the same NDAR-derived norms.
# NOTE(review): make_subtype() starts every row as "SC_equal_RRB" and only
# overwrites rows whose z-score passes a threshold, so a row with an NA
# difference score would keep that default label - confirm the inputs have no
# missing SC/RRB values.
tmp_train <- make_subtype(
  data2use = train_data, z_thresh = z_thresh,
  mean2use = mean2use, sd2use = sd2use
)

tmp_test <- make_subtype(
  data2use = test_data, z_thresh = z_thresh,
  mean2use = mean2use, sd2use = sd2use
)




#===========================================================================
#===========================================================================
# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#
# tmp_test$svm_pred_labels = pred_labels
#
# # plot confusion matrix
# setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
#          breaks= seq(0,100, length=100))
# setHook("grid.newpage", NULL, "replace")
# grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=fontSize))
# grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))
#===========================================================================
#===========================================================================


# scatterplot
# NDAR (all) sample: SC against RRB, coloured by subtype
p1 <- ggplot(
  data = tmp_train,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))
) +
  geom_point() +
  labs(
    x = "Social-Communication",
    y = "Restricted Repetitive Behaviors",
    title = "NDAR ALL"
  ) +
  ylim(0, 0.8) +
  xlim(0, 0.8)
p1

# EU-AIMS sample labelled with the NDAR-derived norms
p2 <- ggplot(
  data = tmp_test,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))
) +
  geom_point() +
  labs(
    x = "Social-Communication",
    y = "Restricted Repetitive Behaviors",
    title = "EU-AIMS with Groups from NDAR ALL"
  ) +
  ylim(0, 0.8) +
  xlim(0, 0.8)
p2

#===========================================================================
#===========================================================================
# p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS")
# p5_bottom_right = p3 + guides(colour=FALSE)
# ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
# p3
#===========================================================================
#===========================================================================

# Write out EU-AIMS LEAP data with subgroups defined by NDAR All
euaims_subtype_file <- file.path(
  datapath,
  sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv", as.character(z_thresh))
)
write.csv(tmp_test, file = euaims_subtype_file)

#===========================================================================
#===========================================================================
# # Make final plot
# p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
# ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
# p_final
#===========================================================================
#===========================================================================

#------------------------------------------------------------------------------
# Integrate subtypes with rest of EU-AIMS LEAP data
# Re-read the unfiltered EU-AIMS table and pull out the TD participants.
# Columns are selected by position (2:6); presumably these are subid, age,
# Centre, Schedule and Diagnosis, since td_df is later row-bound with asd_df -
# verify against the CSV header.
euaims_data <- read.csv(file.path(datapath, "tidy_euaims.csv"))
td_df <- subset(euaims_data, Diagnosis == "TD", select = 2:6)

# TD participants get no severity scores; their subgroup label is "TD"
td_df$A_pct_severity <- NA_real_
td_df$B_pct_severity <- NA_real_
td_df$subgrp <- "TD"

#===========================================================================
#===========================================================================
# cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
cols2use <- c(
  "subid", "age", "Centre", "Schedule", "Diagnosis",
  "dbaes_atotal", "dbaes_btotal", "z_ds_group"
)
#===========================================================================
#===========================================================================
# ASD rows: rename the SC/RRB scores and the subtype column (positions 6-8 of
# cols2use) to the names used in td_df
tmp_asd <- tmp_test[, cols2use]
names(tmp_asd)[6:8] <- c("A_pct_severity", "B_pct_severity", "subgrp")
asd_df <- tmp_asd

all_data <- rbind(td_df, asd_df)

# NOTE(review): absolute, machine-specific path - this step only runs where
# this Dropbox folder exists.
fname <- "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx"
pp_data <- read_excel(path = fname)

# Keep only the rows whose notes column is "ok"
mask <- pp_data$notes == "ok"
pp_data <- subset(pp_data, mask)

# Attach sex to each group; the merge also drops subjects absent from pp_data
sex_lookup <- pp_data[, c("subid", "sex")]
asd_df <- merge(sex_lookup, asd_df, by = "subid")
td_df <- merge(sex_lookup, td_df, by = "subid")

all_data <- rbind(td_df, asd_df)

# age and sex exist on both sides of this merge, so they come back with
# .x/.y suffixes; restore unsuffixed copies
data2write <- merge(pp_data, all_data, by = "subid")
data2write$age <- data2write$age.x
data2write$sex <- data2write$sex.y
# Subgroup counts by schedule
print(table(data2write[["Schedule"]], data2write[["subgrp"]]))
##    
##     RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   A           9           26          45 78
##   B           7           28          50 83
##   C           7           22          34 59
##   D           1            6          31 23
# Subgroup counts by acquisition centre
print(table(data2write[["Centre"]], data2write[["subgrp"]]))
##                
##                 RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   CAMBRIDGE               6           22          14 29
##   KINGS_COLLEGE          12           28          66 78
##   MANNHEIM                0            0           0 34
##   NIJMEGEN                6           26          61 64
##   UTRECHT                 0            6          19 38
# Subgroup counts by sex
print(table(data2write[["sex"]], data2write[["subgrp"]]))
##         
##          RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Female           7           21          43  88
##   Male            17           61         117 155
# DSM-5 - find best split that balances participants across sites
a <- findBestSplit(asd_df, seed_range = 172342) # ,300001:500000))
## [1] 172342
# Seeds whose (non-missing) discrepancy equals the minimum discrepancy
is_best <- !is.na(a$discrepancy) &
  a$discrepancy == min(a$discrepancy, na.rm = TRUE)
best_seeds <- a$seed[is_best]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
rngSeed <- best_seeds[1]

# Split each Schedule (A-D) separately with the same seed. Element 2 of the
# SplitDatasetsBySex() result is used as Discovery, element 1 as Replication.
disc_parts <- list()
rep_parts <- list()
for (sched_id in c("A", "B", "C", "D")) {
  dset2use <- subset(asd_df, asd_df$Schedule == sched_id)
  tmp_d <- SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
  disc_parts[[sched_id]] <- tmp_d[[2]]
  rep_parts[[sched_id]] <- tmp_d[[1]]
}

# Per-schedule halves under their individual names
A_Discovery <- disc_parts[["A"]]
A_Replication <- rep_parts[["A"]]
B_Discovery <- disc_parts[["B"]]
B_Replication <- rep_parts[["B"]]
C_Discovery <- disc_parts[["C"]]
C_Replication <- rep_parts[["C"]]
D_Discovery <- disc_parts[["D"]]
D_Replication <- rep_parts[["D"]]

# Stack the schedules back together within each half
df_Disc <- rbind(A_Discovery, B_Discovery, C_Discovery, D_Discovery)
df_Rep <- rbind(A_Replication, B_Replication, C_Replication, D_Replication)

# Schedule x Centre counts in the Discovery half
a <- table(df_Disc[["Schedule"]], df_Disc[["Centre"]])
print(a)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         6            17       10       7
##   B         9            16       14       3
##   C         6             9       14       2
##   D         0            10       10       0
# Schedule x Centre counts in the Replication half
b <- table(df_Rep[["Schedule"]], df_Rep[["Centre"]])
print(b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         7            18       10       5
##   B         8            17       13       5
##   C         6            10       13       3
##   D         0             9        9       0
# Cell-wise imbalance between the two halves
print(a - b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A        -1            -1        0       2
##   B         1            -1        1      -2
##   C         0            -1        1      -1
##   D         0             1        1       0
# Total absolute imbalance over all cells
print(sum(abs(a - b)))
## [1] 14
# Tag each subject with the half it was assigned to (NA until matched)
data2write$dataset <- NA
mask <- data2write$subid %in% df_Disc$subid
data2write$dataset[mask] <- "Discovery"
mask <- data2write$subid %in% df_Rep$subid
data2write$dataset[mask] <- "Replication"

# ASD participants of each half
asd_Disc <- subset(data2write, dataset == "Discovery" & Diagnosis == "ASD")
asd_Rep <- subset(data2write, dataset == "Replication" & Diagnosis == "ASD")

# # find which seed gives best TD age-match -------------------------------------
# seeds = 1:1000
# pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
# for (i in 1:length(seeds)) {
#   res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
#   td_Disc_matched = res[[2]]
#   td_Rep_matched = res[[1]]
#   tres = t.test(td_Disc_matched$age, asd_Disc$age)
#   pvals[i,1] = tres$p.value
#   tres = t.test(td_Rep_matched$age, asd_Rep$age)
#   pvals[i,2] = tres$p.value
#   #print(i)
# }
# a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
# b=pvals[a$ix,]

# Run the TD age-matching with a single fixed seed (range of length one)
seed2use <- 929
res <- findTDAgeMatch(data2write, seed_range = rep(seed2use, 2))
td_Disc_matched <- res[[2]]
td_Rep_matched <- res[[1]]

# Tag the matched TD subjects with their half
mask <- data2write$subid %in% td_Disc_matched$subid
data2write$dataset[mask] <- "Discovery"

mask <- data2write$subid %in% td_Rep_matched$subid
data2write$dataset[mask] <- "Replication"

# Subgroup counts per half
print(table(data2write[["dataset"]], data2write[["subgrp"]]))
##              
##               RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Discovery            17           38          78 121
##   Replication           7           44          82 122
# Save the merged table; here() resolves the file name against the project root
fname2save <- here(
  sprintf(
    "asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv",
    as.character(z_thresh)
  )
)
write.csv(data2write, file = fname2save)


#------------------------------------------------------------------------------
# Descriptive stats
# Columns carried into the descriptive table
desc_cols <- c(
  "subid", "Diagnosis", "dataset", "Centre", "meanFD", "age", "sex", "subgrp",
  "viq_all", "piq_all", "fsiq4_all", "A_pct_severity", "B_pct_severity",
  "ADI_social_total", "ADI_communication_total", "ADI_RRB_total",
  "ados_2_SA_CSS", "ados_2_RRB_CSS", "SRS_tscore", "SRS_tscore_self",
  "RBS_total", "SSP_total", "vabsdscoresc_dss", "vabsdscoresd_dss",
  "vabsdscoress_dss", "vabsabcabc_standard"
)
df2use <- data2write[, desc_cols]

# Cells equal to 999 or 777 are set to NA.
# NOTE(review): the mask covers every selected column, including age (still
# in its original unit here) and meanFD - confirm these codes cannot occur as
# genuine values in those columns.
mask <- (df2use == 999) | (df2use == 777)
df2use[mask] <- NA
df2use$age <- df2use$age / 365

# Summary statistics for every subgroup x dataset cell
cols2use <- c(
  "dataset", "subgrp", "age", "meanFD", "viq_all", "piq_all", "fsiq4_all",
  "A_pct_severity", "B_pct_severity", "ADI_social_total",
  "ADI_communication_total", "ADI_RRB_total", "ados_2_SA_CSS",
  "ados_2_RRB_CSS", "SRS_tscore", "SRS_tscore_self", "RBS_total", "SSP_total",
  "vabsdscoresc_dss", "vabsdscoresd_dss", "vabsdscoress_dss",
  "vabsabcabc_standard"
)
res <- describeBy(x = df2use[, cols2use], group = c("subgrp", "dataset"))
res
## 
##  Descriptive statistics by group 
## subgrp: RRB_over_SC
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad    min    max
## dataset*                   1 17    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 17    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 17  16.57  6.24  15.83   16.69  9.67   7.08  24.29
## meanFD                     4 17   0.25  0.24   0.18    0.20  0.05   0.06   1.14
## viq_all                    5 17  97.45 15.81  96.00   97.32 20.76  73.00 123.85
## piq_all                    6 17  96.82 17.03  96.00   96.87 16.31  64.00 129.00
## fsiq4_all                  7 17  96.90 15.72  96.00   97.49 20.76  67.00 118.01
## A_pct_severity             8 17   0.21  0.14   0.15    0.21  0.15   0.00   0.49
## B_pct_severity             9 17   0.41  0.14   0.42    0.41  0.16   0.13   0.61
## ADI_social_total          10 17  16.29  7.69  16.00   16.60  8.90   2.00  26.00
## ADI_communication_total   11 17  14.29  6.72  15.00   14.47  7.41   2.00  24.00
## ADI_RRB_total             12 17   7.29  2.23   7.00    7.40  2.97   3.00  10.00
## ados_2_SA_CSS             13 17   4.76  2.93   3.00    4.73  2.97   1.00   9.00
## ados_2_RRB_CSS            14 17   4.76  3.83   5.00    4.67  5.93   1.00  10.00
## SRS_tscore                15 12  74.33 10.35  74.00   74.40 12.60  58.00  90.00
## SRS_tscore_self           16  7  61.29  6.13  63.00   61.29  2.97  49.00  68.00
## RBS_total                 17 11  17.45  9.82  19.00   17.00 14.83   6.00  33.00
## SSP_total                 18  7 139.00 20.08 140.00  139.00 28.17 116.00 167.00
## vabsdscoresc_dss          19 13  72.77 21.44  77.00   72.91 14.83  29.00 115.00
## vabsdscoresd_dss          20 13  65.62 13.04  68.00   67.18  8.90  31.00  83.00
## vabsdscoress_dss          21 13  67.38 13.88  69.00   68.55 16.31  35.00  87.00
## vabsabcabc_standard       22 13  70.15  9.17  70.00   69.64 10.38  58.00  88.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     17.20 -0.10    -1.68 1.51
## meanFD                   1.09  2.88     7.83 0.06
## viq_all                 50.85 -0.11    -1.38 3.84
## piq_all                 65.00 -0.27    -0.55 4.13
## fsiq4_all               51.01 -0.28    -1.21 3.81
## A_pct_severity           0.49  0.38    -1.15 0.03
## B_pct_severity           0.48 -0.31    -1.15 0.03
## ADI_social_total        24.00 -0.26    -1.41 1.86
## ADI_communication_total 22.00 -0.22    -1.19 1.63
## ADI_RRB_total            7.00 -0.41    -1.03 0.54
## ados_2_SA_CSS            8.00  0.23    -1.73 0.71
## ados_2_RRB_CSS           9.00  0.11    -1.90 0.93
## SRS_tscore              32.00  0.11    -1.46 2.99
## SRS_tscore_self         19.00 -0.92    -0.46 2.32
## RBS_total               27.00  0.21    -1.60 2.96
## SSP_total               51.00  0.13    -1.88 7.59
## vabsdscoresc_dss        86.00 -0.30    -0.05 5.95
## vabsdscoresd_dss        52.00 -1.17     1.20 3.62
## vabsdscoress_dss        52.00 -0.69    -0.17 3.85
## vabsabcabc_standard     30.00  0.24    -1.05 2.54
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 38    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 38    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 38  16.43  6.01  14.92   16.02  4.75  7.75  30.28
## meanFD                     4 38   0.36  0.63   0.24    0.25  0.17  0.04   3.95
## viq_all                    5 38  97.98 18.61 102.00   97.87 16.29 64.00 136.00
## piq_all                    6 38 101.00 19.23 104.50  101.52 17.03 61.00 142.00
## fsiq4_all                  7 38  99.39 17.45 105.00   99.91 20.02 60.00 131.00
## A_pct_severity             8 38   0.32  0.14   0.31    0.31  0.13  0.03   0.63
## B_pct_severity             9 38   0.33  0.15   0.31    0.32  0.13  0.02   0.69
## ADI_social_total          10 38  17.55  7.28  19.00   17.94  6.67  3.00  27.00
## ADI_communication_total   11 38  14.42  6.13  14.00   14.50  6.67  0.00  26.00
## ADI_RRB_total             12 38   5.63  2.40   6.00    5.62  2.22  1.00  12.00
## ados_2_SA_CSS             13 37   6.38  2.56   7.00    6.48  2.97  1.00  10.00
## ados_2_RRB_CSS            14 37   5.24  2.55   6.00    5.32  1.48  1.00  10.00
## SRS_tscore                15 36  71.42 11.97  74.00   71.67 11.86 47.00  90.00
## SRS_tscore_self           16 22  61.86 12.11  61.50   61.39 14.08 43.00  89.00
## RBS_total                 17 34  17.06 13.32  15.00   15.43 10.38  0.00  53.00
## SSP_total                 18 23 134.35 31.32 137.00  134.58 44.48 81.00 187.00
## vabsdscoresc_dss          19 37  72.24 18.80  75.00   73.10 11.86 21.00 122.00
## vabsdscoresd_dss          20 36  72.81 15.98  72.50   73.50 11.86 25.00 105.00
## vabsdscoress_dss          21 37  71.59 16.35  74.00   73.13 11.86 20.00  95.00
## vabsabcabc_standard       22 36  70.03 14.39  72.00   71.17  9.64 20.00 100.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      22.53  0.64    -0.62 0.98
## meanFD                    3.91  4.98    25.67 0.10
## viq_all                  72.00 -0.15    -0.77 3.02
## piq_all                  81.00 -0.33    -0.45 3.12
## fsiq4_all                71.00 -0.29    -0.74 2.83
## A_pct_severity            0.60  0.18    -0.44 0.02
## B_pct_severity            0.67  0.40    -0.37 0.02
## ADI_social_total         24.00 -0.51    -0.97 1.18
## ADI_communication_total  26.00 -0.18    -0.67 0.99
## ADI_RRB_total            11.00  0.07    -0.16 0.39
## ados_2_SA_CSS             9.00 -0.30    -0.97 0.42
## ados_2_RRB_CSS            9.00 -0.56    -0.77 0.42
## SRS_tscore               43.00 -0.29    -1.05 1.99
## SRS_tscore_self          46.00  0.24    -0.92 2.58
## RBS_total                53.00  1.11     0.75 2.28
## SSP_total               106.00 -0.08    -1.16 6.53
## vabsdscoresc_dss        101.00 -0.37     1.24 3.09
## vabsdscoresd_dss         80.00 -0.51     0.85 2.66
## vabsdscoress_dss         75.00 -1.09     1.28 2.69
## vabsabcabc_standard      80.00 -1.12     2.65 2.40
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 78    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 78    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 78  16.28  5.14  15.96   16.12  5.11  7.56  29.40
## meanFD                     4 78   0.22  0.21   0.15    0.18  0.10  0.03   1.08
## viq_all                    5 76  97.81 19.26  99.00   97.59 19.27 61.00 142.00
## piq_all                    6 76  99.49 22.56 102.40  100.25 21.34 52.43 150.00
## fsiq4_all                  7 78  99.02 19.54 102.25   99.52 19.60 59.00 143.00
## A_pct_severity             8 78   0.42  0.14   0.43    0.41  0.13  0.12   0.82
## B_pct_severity             9 78   0.18  0.11   0.16    0.17  0.12  0.00   0.46
## ADI_social_total          10 78  17.60  6.32  18.00   17.91  7.41  3.00  28.00
## ADI_communication_total   11 78  14.13  5.03  14.50   14.28  5.19  2.00  24.00
## ADI_RRB_total             12 78   3.27  2.20   3.00    3.14  1.48  0.00  10.00
## ados_2_SA_CSS             13 76   6.34  2.55   7.00    6.48  2.97  1.00  10.00
## ados_2_RRB_CSS            14 76   4.59  2.77   5.00    4.50  2.97  1.00  10.00
## SRS_tscore                15 66  72.73 12.13  74.00   73.28 14.08 44.00  95.00
## SRS_tscore_self           16 31  62.94 12.19  61.00   61.72 10.38 42.00  94.00
## RBS_total                 17 65  17.20 16.86  15.00   14.58 16.31  0.00  90.00
## SSP_total                 18 49 138.29 30.55 139.00  139.22 37.06 53.00 189.00
## vabsdscoresc_dss          19 73  72.53 15.99  72.00   73.24 11.86 21.00 107.00
## vabsdscoresd_dss          20 73  72.11 17.06  73.00   71.68 14.83 17.00 131.00
## vabsdscoress_dss          21 73  68.18 15.73  69.00   69.37 13.34 20.00 104.00
## vabsabcabc_standard       22 73  68.81 14.97  71.00   69.54 10.38  6.00 103.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      21.85  0.29    -0.48 0.58
## meanFD                    1.05  2.28     5.30 0.02
## viq_all                  81.00  0.09    -0.57 2.21
## piq_all                  97.57 -0.29    -0.55 2.59
## fsiq4_all                84.00 -0.26    -0.84 2.21
## A_pct_severity            0.70  0.23    -0.22 0.02
## B_pct_severity            0.46  0.44    -0.53 0.01
## ADI_social_total         25.00 -0.35    -0.84 0.72
## ADI_communication_total  22.00 -0.22    -0.69 0.57
## ADI_RRB_total            10.00  0.63    -0.01 0.25
## ados_2_SA_CSS             9.00 -0.49    -0.86 0.29
## ados_2_RRB_CSS            9.00 -0.15    -1.21 0.32
## SRS_tscore               51.00 -0.29    -0.52 1.49
## SRS_tscore_self          52.00  0.82     0.34 2.19
## RBS_total                90.00  1.75     4.17 2.09
## SSP_total               136.00 -0.36    -0.40 4.36
## vabsdscoresc_dss         86.00 -0.77     2.14 1.87
## vabsdscoresd_dss        114.00  0.22     2.19 2.00
## vabsdscoress_dss         84.00 -0.82     1.15 1.84
## vabsabcabc_standard      97.00 -1.25     4.41 1.75
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Discovery
##                         vars   n   mean    sd median trimmed   mad    min
## dataset*                   1 121    NaN    NA     NA     NaN    NA    Inf
## subgrp*                    2 121    NaN    NA     NA     NaN    NA    Inf
## age                        3 121  16.83  5.23  16.65   16.73  5.69   7.22
## meanFD                     4 121   0.18  0.15   0.13    0.15  0.07   0.03
## viq_all                    5 119 104.52 19.70 105.00  105.03 19.27  46.00
## piq_all                    6 119 106.10 19.47 107.00  107.53 17.79  49.00
## fsiq4_all                  7 119 105.72 18.33 108.18  106.99 16.58  53.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA    Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA    Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA    Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA    Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA    Inf
## SRS_tscore                15  68  47.84  9.40  45.00   46.32  5.19  37.00
## SRS_tscore_self           16  71  46.69  4.85  46.00   46.26  4.45  39.00
## RBS_total                 17  68   2.15  4.74   0.00    0.95  0.00   0.00
## SSP_total                 18  59 177.86 12.71 182.00  179.78  8.90 122.00
## vabsdscoresc_dss          19  34  91.97 25.44  99.50   93.50 21.50  21.00
## vabsdscoresd_dss          20  34  90.74 20.28  98.50   92.25 17.05  33.00
## vabsdscoress_dss          21  34  96.21 23.67 102.50   98.57 21.50  33.00
## vabsabcabc_standard       22  34  92.06 23.04  99.50   93.89 15.57  25.00
##                            max  range  skew kurtosis   se
## dataset*                  -Inf   -Inf    NA       NA   NA
## subgrp*                   -Inf   -Inf    NA       NA   NA
## age                      29.84  22.62  0.17    -0.51 0.48
## meanFD                    0.85   0.82  2.28     5.65 0.01
## viq_all                 160.00 114.00 -0.24     0.38 1.81
## piq_all                 147.00  98.00 -0.69     0.32 1.79
## fsiq4_all               142.00  89.00 -0.69     0.58 1.68
## A_pct_severity            -Inf   -Inf    NA       NA   NA
## B_pct_severity            -Inf   -Inf    NA       NA   NA
## ADI_social_total          -Inf   -Inf    NA       NA   NA
## ADI_communication_total   -Inf   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf   -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf   -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf   -Inf    NA       NA   NA
## SRS_tscore               76.00  39.00  1.54     1.44 1.14
## SRS_tscore_self          63.00  24.00  0.93     1.10 0.58
## RBS_total                27.00  27.00  3.13    10.87 0.57
## SSP_total               190.00  68.00 -1.90     4.85 1.66
## vabsdscoresc_dss        138.00 117.00 -0.71     0.36 4.36
## vabsdscoresd_dss        121.00  88.00 -0.80     0.07 3.48
## vabsdscoress_dss        129.00  96.00 -0.83    -0.31 4.06
## vabsabcabc_standard     127.00 102.00 -0.90     0.30 3.95
## ------------------------------------------------------------ 
## subgrp: RRB_over_SC
## dataset: Replication
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 7    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 7    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 7  13.79  3.76  12.61   13.79  3.32   8.31  19.56
## meanFD                     4 7   0.28  0.23   0.20    0.28  0.13   0.10   0.76
## viq_all                    5 7 106.49 17.25 102.73  106.49  7.82  91.00 143.00
## piq_all                    6 7 107.26 20.78 101.00  107.26 14.77  89.00 148.00
## fsiq4_all                  7 7 107.43 18.50 104.00  107.43  7.84  93.00 148.00
## A_pct_severity             8 7   0.15  0.07   0.16    0.15  0.02   0.04   0.27
## B_pct_severity             9 7   0.32  0.09   0.30    0.32  0.08   0.18   0.45
## ADI_social_total          10 7  14.71  5.22  16.00   14.71  4.45   5.00  20.00
## ADI_communication_total   11 7   8.57  3.46   9.00    8.57  2.97   3.00  13.00
## ADI_RRB_total             12 7   4.71  1.98   5.00    4.71  1.48   1.00   7.00
## ados_2_SA_CSS             13 7   5.14  2.79   5.00    5.14  4.45   1.00   8.00
## ados_2_RRB_CSS            14 7   4.57  2.57   5.00    4.57  2.97   1.00   7.00
## SRS_tscore                15 6  68.00 11.82  72.00   68.00  8.90  48.00  79.00
## SRS_tscore_self           16 2  64.50  3.54  64.50   64.50  3.71  62.00  67.00
## RBS_total                 17 5  14.60 10.01  13.00   14.60  8.90   5.00  30.00
## SSP_total                 18 4 138.50 11.15 137.50  138.50  9.64 126.00 153.00
## vabsdscoresc_dss          19 7  82.29 18.51  74.00   82.29  7.41  68.00 117.00
## vabsdscoresd_dss          20 7  73.43  6.60  71.00   73.43  4.45  66.00  85.00
## vabsdscoress_dss          21 7  83.00  9.56  82.00   83.00  8.90  67.00  95.00
## vabsabcabc_standard       22 7  72.71 17.02  77.00   72.71  5.93  39.00  94.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     11.26  0.14    -1.43 1.42
## meanFD                   0.66  1.15    -0.24 0.09
## viq_all                 52.00  1.19     0.00 6.52
## piq_all                 59.00  0.94    -0.72 7.85
## fsiq4_all               55.00  1.40     0.38 6.99
## A_pct_severity           0.23  0.06    -0.93 0.03
## B_pct_severity           0.27  0.07    -1.49 0.03
## ADI_social_total        15.00 -0.68    -1.03 1.97
## ADI_communication_total 10.00 -0.27    -1.53 1.31
## ADI_RRB_total            6.00 -0.66    -0.90 0.75
## ados_2_SA_CSS            7.00 -0.32    -1.74 1.06
## ados_2_RRB_CSS           6.00 -0.50    -1.70 0.97
## SRS_tscore              31.00 -0.64    -1.41 4.82
## SRS_tscore_self          5.00  0.00    -2.75 2.50
## RBS_total               25.00  0.47    -1.64 4.48
## SSP_total               27.00  0.19    -1.89 5.58
## vabsdscoresc_dss        49.00  0.89    -1.03 7.00
## vabsdscoresd_dss        19.00  0.56    -1.28 2.50
## vabsdscoress_dss        28.00 -0.31    -1.36 3.61
## vabsabcabc_standard     55.00 -0.81    -0.44 6.43
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 44    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 44    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 44  16.88  6.45  16.42   16.59  7.37  7.12  30.15
## meanFD                     4 44   0.21  0.13   0.18    0.19  0.10  0.06   0.62
## viq_all                    5 44 102.27 16.10 103.00  102.86 18.98 70.00 133.00
## piq_all                    6 44 106.73 16.97 106.50  107.35 19.27 66.42 134.00
## fsiq4_all                  7 44 104.78 16.10 106.91  105.38 17.18 69.00 130.00
## A_pct_severity             8 44   0.25  0.12   0.23    0.24  0.11  0.04   0.65
## B_pct_severity             9 44   0.25  0.14   0.23    0.24  0.12  0.00   0.67
## ADI_social_total          10 44  14.52  6.73  14.50   14.67  7.41  1.00  27.00
## ADI_communication_total   11 44  11.07  5.33  11.00   11.14  5.93  0.00  21.00
## ADI_RRB_total             12 44   4.11  2.53   4.00    4.08  2.97  0.00   9.00
## ados_2_SA_CSS             13 43   5.47  2.55   6.00    5.51  2.97  1.00  10.00
## ados_2_RRB_CSS            14 43   4.74  2.47   5.00    4.71  1.48  1.00   9.00
## SRS_tscore                15 39  64.90 11.78  62.00   64.58 11.86 43.00  90.00
## SRS_tscore_self           16 23  60.74  8.11  61.00   60.74  7.41 46.00  79.00
## RBS_total                 17 38  14.18 11.93  11.50   12.78 11.12  0.00  52.00
## SSP_total                 18 25 142.72 28.08 148.00  145.19 28.17 69.00 177.00
## vabsdscoresc_dss          19 41  82.41 14.55  81.00   82.15 14.83 50.00 122.00
## vabsdscoresd_dss          20 41  79.88 16.29  79.00   79.21 14.83 38.00 119.00
## vabsdscoress_dss          21 41  77.95 15.31  80.00   78.82 16.31 30.00 101.00
## vabsabcabc_standard       22 41  79.32 13.31  78.00   78.82 10.38 48.00 117.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.03  0.31    -1.01 0.97
## meanFD                    0.56  1.24     1.15 0.02
## viq_all                  63.00 -0.22    -0.91 2.43
## piq_all                  67.58 -0.33    -0.69 2.56
## fsiq4_all                61.00 -0.28    -0.87 2.43
## A_pct_severity            0.61  0.72     1.12 0.02
## B_pct_severity            0.67  0.74     0.48 0.02
## ADI_social_total         26.00 -0.13    -0.93 1.01
## ADI_communication_total  21.00 -0.09    -0.82 0.80
## ADI_RRB_total             9.00  0.09    -0.81 0.38
## ados_2_SA_CSS             9.00 -0.17    -0.98 0.39
## ados_2_RRB_CSS            8.00 -0.37    -0.86 0.38
## SRS_tscore               47.00  0.24    -0.93 1.89
## SRS_tscore_self          33.00  0.06    -0.51 1.69
## RBS_total                52.00  1.26     1.64 1.94
## SSP_total               108.00 -0.82    -0.26 5.62
## vabsdscoresc_dss         72.00  0.29     0.10 2.27
## vabsdscoresd_dss         81.00  0.23     0.16 2.54
## vabsdscoress_dss         71.00 -0.61     0.61 2.39
## vabsabcabc_standard      69.00  0.43     0.33 2.08
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 82    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 82    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 82  16.36  5.02  15.93   16.20  5.75  7.48  29.23
## meanFD                     4 82   0.26  0.30   0.16    0.19  0.11  0.04   1.60
## viq_all                    5 78  96.85 19.43  99.00   97.91 20.76 50.91 130.00
## piq_all                    6 80  97.92 21.49 101.50   99.48 20.02 44.03 138.00
## fsiq4_all                  7 79  97.98 19.63 103.00   98.81 19.81 59.00 139.00
## A_pct_severity             8 82   0.44  0.15   0.43    0.44  0.17  0.13   0.75
## B_pct_severity             9 82   0.20  0.11   0.21    0.19  0.14  0.00   0.47
## ADI_social_total          10 82  17.88  5.74  19.00   18.06  5.93  4.00  29.00
## ADI_communication_total   11 82  14.72  5.02  15.00   14.85  5.93  3.00  24.00
## ADI_RRB_total             12 82   3.78  2.34   3.00    3.62  1.48  0.00  10.00
## ados_2_SA_CSS             13 78   6.14  2.73   6.00    6.22  2.97  1.00  10.00
## ados_2_RRB_CSS            14 78   4.74  2.77   5.00    4.67  2.97  1.00  10.00
## SRS_tscore                15 73  73.48 11.33  73.00   74.03 13.34 48.00  90.00
## SRS_tscore_self           16 38  62.92  9.59  61.50   62.72  5.93 40.00  84.00
## RBS_total                 17 72  17.53 14.53  13.00   15.76 11.12  0.00  73.00
## SSP_total                 18 54 138.17 25.37 138.50  138.57 28.17 90.00 184.00
## vabsdscoresc_dss          19 73  74.45 14.87  75.00   74.81 10.38 21.00 110.00
## vabsdscoresd_dss          20 72  71.62 15.97  69.00   71.21 15.57 42.00 118.00
## vabsdscoress_dss          21 73  66.56 15.93  68.00   67.00 13.34 23.00 112.00
## vabsabcabc_standard       22 72  69.07 13.42  69.00   69.47 11.12 28.00 107.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     21.75  0.33    -0.56 0.55
## meanFD                   1.55  3.13    10.11 0.03
## viq_all                 79.09 -0.47    -0.56 2.20
## piq_all                 93.97 -0.55    -0.58 2.40
## fsiq4_all               80.00 -0.38    -0.82 2.21
## A_pct_severity           0.62  0.08    -0.89 0.02
## B_pct_severity           0.47  0.27    -0.81 0.01
## ADI_social_total        25.00 -0.32    -0.61 0.63
## ADI_communication_total 21.00 -0.24    -0.83 0.55
## ADI_RRB_total           10.00  0.56    -0.37 0.26
## ados_2_SA_CSS            9.00 -0.14    -1.12 0.31
## ados_2_RRB_CSS           9.00 -0.21    -1.18 0.31
## SRS_tscore              42.00 -0.29    -0.91 1.33
## SRS_tscore_self         44.00  0.24     0.11 1.56
## RBS_total               73.00  1.28     1.65 1.71
## SSP_total               94.00 -0.14    -0.93 3.45
## vabsdscoresc_dss        89.00 -0.61     2.14 1.74
## vabsdscoresd_dss        76.00  0.41     0.01 1.88
## vabsdscoress_dss        89.00 -0.23     0.54 1.86
## vabsabcabc_standard     79.00 -0.35     1.26 1.58
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Replication
##                         vars   n   mean    sd median trimmed   mad   min    max
## dataset*                   1 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 122  16.86  6.07  16.34   16.58  7.59  6.89  29.72
## meanFD                     4 122   0.23  0.46   0.14    0.15  0.07  0.04   4.60
## viq_all                    5 122 104.02 17.58 108.18  105.64 12.13 45.00 140.00
## piq_all                    6 122 104.64 18.41 108.96  106.56 14.08 49.00 139.00
## fsiq4_all                  7 122 104.94 17.14 108.09  107.29 11.86 50.00 134.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## SRS_tscore                15  65  47.23  9.34  44.00   45.66  4.45 37.00  90.00
## SRS_tscore_self           16  61  48.44  6.84  47.00   47.63  5.93 39.00  69.00
## RBS_total                 17  63   3.08 11.54   0.00    0.86  0.00  0.00  89.00
## SSP_total                 18  54 174.93 19.38 182.00  178.41  6.67 75.00 190.00
## vabsdscoresc_dss          19  39  92.74 25.45  96.00   95.82 20.76 21.00 125.00
## vabsdscoresd_dss          20  39  91.10 22.65  97.00   93.91 14.83 27.00 122.00
## vabsdscoress_dss          21  39  98.90 27.04 103.00  102.12 17.79 20.00 132.00
## vabsabcabc_standard       22  38  93.00 25.39 100.00   96.44 15.57 20.00 126.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      22.83  0.33    -0.97 0.55
## meanFD                    4.56  7.54    64.83 0.04
## viq_all                  95.00 -0.99     1.51 1.59
## piq_all                  90.00 -0.93     0.67 1.67
## fsiq4_all                84.00 -1.28     1.67 1.55
## A_pct_severity            -Inf    NA       NA   NA
## B_pct_severity            -Inf    NA       NA   NA
## ADI_social_total          -Inf    NA       NA   NA
## ADI_communication_total   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf    NA       NA   NA
## SRS_tscore               53.00  2.14     5.82 1.16
## SRS_tscore_self          30.00  1.05     0.48 0.88
## RBS_total                89.00  6.60    45.89 1.45
## SSP_total               115.00 -2.88    10.92 2.64
## vabsdscoresc_dss        104.00 -1.21     1.00 4.08
## vabsdscoresd_dss         95.00 -1.34     1.26 3.63
## vabsdscoress_dss        112.00 -1.26     1.10 4.33
## vabsabcabc_standard     106.00 -1.42     1.40 4.12
#------------------------------------------------------------------------------
# Table of subtypes X sex

# Discovery: restrict to the Discovery rows, then cross-tabulate subgroup by sex
data2use <- subset(data2write, dataset == "Discovery")
table(data2use$subgrp, data2use$sex)
##               
##                Female Male
##   RRB_over_SC       6   11
##   SC_equal_RRB      9   29
##   SC_over_RRB      20   58
##   TD               41   80
# Pearson chi-square test of independence between subgroup and sex (Discovery)
cs_res <- chisq.test(x = data2use$subgrp, y = data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 2.5268, df = 3, p-value = 0.4705
# Replication: restrict to the Replication rows, then cross-tabulate subgroup by sex
data2use <- subset(data2write, dataset == "Replication")
table(data2use$subgrp, data2use$sex)
##               
##                Female Male
##   RRB_over_SC       1    6
##   SC_equal_RRB     12   32
##   SC_over_RRB      23   59
##   TD               47   75
# Pearson chi-square test of independence between subgroup and sex (Replication)
# NOTE(review): RRB_over_SC has only 7 subjects in the Replication table, so
# some expected cell counts are likely small -- confirm the chi-square
# approximation is acceptable here.
cs_res <- chisq.test(x = data2use$subgrp, y = data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.3621, df = 3, p-value = 0.2249
#------------------------------------------------------------------------------
# Column names in the data of the variables tested in the loop below
# (order matters: vnames is indexed in parallel with this vector)
vars2analyze <- c(
  "age",
  "meanFD",
  "viq_all",
  "piq_all",
  "fsiq4_all",
  "A_pct_severity",
  "B_pct_severity",
  "ADI_social_total",
  "ADI_communication_total",
  "ADI_RRB_total",
  "ados_2_SA_CSS",
  "ados_2_RRB_CSS",
  "SRS_tscore_self",
  "RBS_total",
  "SSP_total",
  "vabsdscoress_dss",
  "vabsdscoresd_dss",
  "vabsdscoresc_dss",
  "vabsabcabc_standard"
)

# Axis labels used when plotting; one label per entry of vars2analyze
vnames <- c(
  "Age",
  "Mean FD",
  "VIQ",
  "PIQ",
  "FIQ",
  "ADI-R SC",
  "ADI-R RRB",
  "ADI-R Social",
  "ADI-R Communication",
  "ADI-R RRB",
  "ADOS SA CSS",
  "ADOS RRB CSS",
  "SRS",
  "RBS",
  "SSP",
  "Vineland Socialization",
  "Vineland Daily Living Skills",
  "Vineland Communication",
  "Vineland ABC"
)

# Statistics collected per variable: omnibus tests, the
# SC_equal_RRB vs SC_over_RRB contrast in each dataset, and the replication BF
cnames <- c(
  "All_Disc.fstat",
  "All_Disc.pval",
  "All_Rep.fstat",
  "All_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.fstat",
  "SCequalRRB_vs_SCoverRRB_Disc.tstat",
  "SCequalRRB_vs_SCoverRRB_Disc.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.es",
  "SCequalRRB_vs_SCoverRRB_Rep.fstat",
  "SCequalRRB_vs_SCoverRRB_Rep.tstat",
  "SCequalRRB_vs_SCoverRRB_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Rep.es",
  "SCequalRRB_vs_SCoverRRB.repBF"
)

# Empty (all-NA) result table: one row per variable, one column per statistic,
# plus a trailing varNames column holding the variable name itself
output_res <- as.data.frame(
  matrix(
    NA,
    nrow = length(vars2analyze),
    ncol = length(cnames),
    dimnames = list(vars2analyze, cnames)
  )
)
output_res[["varNames"]] <- vars2analyze

# Loop over every phenotypic variable in vars2analyze and, for each one:
#   1. omnibus test of subgrp (all subgroups) in Discovery and in Replication
#   2. SC_equal_RRB vs SC_over_RRB contrast (TD and RRB_over_SC rows removed)
#      in Discovery and in Replication: F, p, t and Cohen's d
#   3. replication Bayes factor for that contrast via BFSALL()
#   4. jitter + box plot of the variable by subgroup, faceted by dataset
# Every model is an lme() fit with subgrp as the only fixed effect and a
# random intercept for Centre; incomplete rows are dropped (na.action = na.omit).
# Results are written into output_res, one row per variable.
for (ivar in seq_along(vars2analyze)){

  y_var = vars2analyze[ivar]
  # print(y_var)

  # model formulas are the same for all four fits of this variable:
  # fixed effect = subgrp, random intercept = Centre (site)
  fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
  rx_form = as.formula(sprintf("~ 1|%s","Centre"))

  #----------------------------------------------------------------------------
  # Discovery: omnibus test across all subgroups
  df4mod = subset(df2use, df2use$dataset=="Discovery")

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"All_Disc.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"All_Disc.pval"] = res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Replication: omnibus test across all subgroups
  df4mod = subset(df2use, df2use$dataset=="Replication")

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"All_Rep.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"All_Rep.pval"] = res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Discovery: SC_equal_RRB vs SC_over_RRB only
  df4mod = subset(df2use, df2use$dataset=="Discovery" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  # group sizes passed to BFSALL below
  # NOTE(review): these count every row of the subgroup, whereas the model
  # drops rows with missing values (na.omit) -- confirm this is intended.
  n1 = sum(df4mod$subgrp=="SC_equal_RRB")
  m1 = sum(df4mod$subgrp=="SC_over_RRB")

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.pval"] = res["subgrp","p-value"]
  # second row of the fixed-effects table is the subgrp coefficient
  # (the first row is the intercept)
  res = summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"] = res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.es"] = cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication: SC_equal_RRB vs SC_over_RRB only
  df4mod = subset(df2use, df2use$dataset=="Replication" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  n2 = sum(df4mod$subgrp=="SC_equal_RRB")
  m2 = sum(df4mod$subgrp=="SC_over_RRB")

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.pval"] = res["subgrp","p-value"]
  res = summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"] = res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.es"] = cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication Bayes Factor
  # tobs = Discovery t-statistic, trep = Replication t-statistic.
  # (trep previously re-used the Discovery t-statistic, so the Bayes factor
  # compared the Discovery result with itself; any previously saved or printed
  # repBF values need to be regenerated.)
  res_bf = BFSALL(tobs = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"],
                  trep = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"],
                  n1 = n1,
                  n2 = n2,
                  m1 = m1,
                  m2 = m2,
                  sample = 2,
                  Type = 'ALL')
  output_res[y_var,"SCequalRRB_vs_SCoverRRB.repBF"] = res_bf["Replication BF","Replication 1"]

  # make a plot: RRB_over_SC is left out, so three groups remain
  # (SC_equal_RRB, SC_over_RRB, TD); TD is drawn in grey
  colors2use = get_ggColorHue(3)
  df4plot = subset(df2use, df2use$subgrp!="RRB_over_SC")
  p = ggplot(data = df4plot, aes_string(x = "subgrp", y = y_var, colour = "subgrp")) + facet_grid(. ~ dataset)
  p = p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
  # guides(colour = "none") hides the colour legend; it replaces the
  # deprecated guides(colour = FALSE)
  p = p + ylab(vnames[ivar]) + xlab("Group") +
    scale_x_discrete(labels=c("SC_equal_RRB"="SC=RRB","SC_over_RRB"="SC>RRB","TD"="TD")) +
    scale_colour_manual(values = c(colors2use[2:3],"grey60")) + guides(colour = "none") +
    theme(text = element_text(size=fontSize-5),
          axis.text.x = element_text(size=fontSize-5),
          axis.text.y = element_text(size=fontSize-5))
  print(p)

}

# Effect-size tables (SC_equal_RRB vs SC_over_RRB Cohen's d) for two Vineland
# scores. Rows are labelled "0.5" ... "1" (presumably z-thresholds -- TODO
# confirm); only the "0.5" row is filled here, the rest stay NA.

# Vineland ABC standard score
vabc <- as.data.frame(
  matrix(
    NA,
    nrow = 6,
    ncol = 2,
    dimnames = list(
      c("0.5","0.6","0.7","0.8","0.9","1"),
      c("Discovery","Replication")
    )
  )
)
vabc["0.5", "Discovery"] <-
  output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc["0.5", "Replication"] <-
  output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Rep.es"]

# Vineland score stored in vabsdscoresd_dss
vabc_dls <- as.data.frame(
  matrix(
    NA,
    nrow = 6,
    ncol = 2,
    dimnames = list(
      c("0.5","0.6","0.7","0.8","0.9","1"),
      c("Discovery","Replication")
    )
  )
)
vabc_dls["0.5", "Discovery"] <-
  output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc_dls["0.5", "Replication"] <-
  output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Rep.es"]

# show the full results table
output_res
##                         All_Disc.fstat All_Disc.pval All_Rep.fstat All_Rep.pval
## age                          0.2573919  8.560255e-01    0.99073365 3.977672e-01
## meanFD                       3.7568185  1.149997e-02    0.24909122 8.619423e-01
## viq_all                      2.0602422  1.061159e-01    1.91036720 1.284730e-01
## piq_all                      1.6442358  1.797895e-01    1.59078093 1.921753e-01
## fsiq4_all                    2.2680322  8.121156e-02    1.65268759 1.778719e-01
## A_pct_severity              27.8532657  9.335821e-11   36.83073911 2.426948e-13
## B_pct_severity              31.1111186  1.008793e-11    4.96019221 8.429684e-03
## ADI_social_total             1.9132206  1.518344e-01    5.35188550 5.867807e-03
## ADI_communication_total      0.7525201  4.732668e-01   11.76233584 2.058013e-05
## ADI_RRB_total               26.0459642  3.320546e-10    0.54838564 5.792441e-01
## ados_2_SA_CSS                2.6948807  7.150723e-02    1.55852306 2.146072e-01
## ados_2_RRB_CSS               0.7451863  4.767593e-01    0.09256188 9.116568e-01
## SRS_tscore_self             36.6468450  0.000000e+00   32.85687549 1.887379e-15
## RBS_total                   18.4726404  2.023459e-10   12.69506727 1.580707e-07
## SSP_total                   30.3736907  5.995204e-15   22.94434995 5.732526e-12
## vabsdscoress_dss            22.8844511  2.963851e-12   24.37737030 5.986323e-13
## vabsdscoresd_dss            11.9970970  4.428882e-07    9.41186882 9.619700e-06
## vabsdscoresc_dss             8.9064742  1.810010e-05    7.87815546 6.373651e-05
## vabsabcabc_standard         17.4757273  8.952938e-10   15.73645152 5.898297e-09
##                         SCequalRRB_vs_SCoverRRB_Disc.fstat
## age                                            0.023737296
## meanFD                                         3.030459423
## viq_all                                        0.029671461
## piq_all                                        0.063182446
## fsiq4_all                                      0.009116035
## A_pct_severity                                21.687217221
## B_pct_severity                                39.468478364
## ADI_social_total                               0.149703080
## ADI_communication_total                        0.344130518
## ADI_RRB_total                                 25.836285643
## ados_2_SA_CSS                                  0.003129000
## ados_2_RRB_CSS                                 1.700635530
## SRS_tscore_self                                1.128509387
## RBS_total                                      0.071971426
## SSP_total                                      0.274616524
## vabsdscoress_dss                               2.202237362
## vabsdscoresd_dss                               0.143787967
## vabsdscoresc_dss                               0.003264895
## vabsabcabc_standard                            0.200518258
##                         SCequalRRB_vs_SCoverRRB_Disc.tstat
## age                                            -0.15406913
## meanFD                                         -1.74082148
## viq_all                                        -0.17225406
## piq_all                                        -0.25136119
## fsiq4_all                                      -0.09547793
## A_pct_severity                                  4.65695364
## B_pct_severity                                 -6.28239432
## ADI_social_total                                0.38691482
## ADI_communication_total                         0.58662639
## ADI_RRB_total                                  -5.08294065
## ados_2_SA_CSS                                   0.05593746
## ados_2_RRB_CSS                                 -1.30408417
## SRS_tscore_self                                 1.06231322
## RBS_total                                       0.26827491
## SSP_total                                       0.52403867
## vabsdscoress_dss                               -1.48399372
## vabsdscoresd_dss                               -0.37919384
## vabsdscoresc_dss                                0.05713926
## vabsabcabc_standard                            -0.44779265
##                         SCequalRRB_vs_SCoverRRB_Disc.pval
## age                                          8.778350e-01
## meanFD                                       8.448583e-02
## viq_all                                      8.635572e-01
## piq_all                                      8.020081e-01
## fsiq4_all                                    9.241074e-01
## A_pct_severity                               8.957158e-06
## B_pct_severity                               6.696983e-09
## ADI_social_total                             6.995601e-01
## ADI_communication_total                      5.586458e-01
## ADI_RRB_total                                1.518247e-06
## ados_2_SA_CSS                                9.554950e-01
## ados_2_RRB_CSS                               1.949775e-01
## SRS_tscore_self                              2.934092e-01
## RBS_total                                    7.890756e-01
## SSP_total                                    6.019810e-01
## vabsdscoress_dss                             1.408063e-01
## vabsdscoresd_dss                             7.053170e-01
## vabsdscoresc_dss                             9.545429e-01
## vabsabcabc_standard                          6.552342e-01
##                         SCequalRRB_vs_SCoverRRB_Disc.es
## age                                         0.028844334
## meanFD                                      0.344384924
## viq_all                                     0.008856869
## piq_all                                     0.069781157
## fsiq4_all                                   0.020006875
## A_pct_severity                             -0.715775439
## B_pct_severity                              1.216289079
## ADI_social_total                           -0.007511102
## ADI_communication_total                     0.054103075
## ADI_RRB_total                               1.041784220
## ados_2_SA_CSS                               0.014204764
## ados_2_RRB_CSS                              0.241170234
## SRS_tscore_self                            -0.088107447
## RBS_total                                  -0.008937980
## SSP_total                                  -0.127843143
## vabsdscoress_dss                            0.214368240
## vabsdscoresd_dss                            0.041627724
## vabsdscoresc_dss                           -0.017163951
## vabsabcabc_standard                         0.082515990
##                         SCequalRRB_vs_SCoverRRB_Rep.fstat
## age                                            0.24570362
## meanFD                                         0.99663587
## viq_all                                        1.72401230
## piq_all                                        3.58574024
## fsiq4_all                                      2.47507676
## A_pct_severity                                55.22952665
## B_pct_severity                                 5.50174307
## ADI_social_total                               9.47374889
## ADI_communication_total                       15.98406931
## ADI_RRB_total                                  0.55594738
## ados_2_SA_CSS                                  1.88367323
## ados_2_RRB_CSS                                 0.24422396
## SRS_tscore_self                                0.82958946
## RBS_total                                      1.15143630
## SSP_total                                      0.05281376
## vabsdscoress_dss                              14.90205172
## vabsdscoresd_dss                               7.04076224
## vabsdscoresc_dss                               7.64829090
## vabsabcabc_standard                           17.25615787
##                         SCequalRRB_vs_SCoverRRB_Rep.tstat
## age                                            -0.4956850
## meanFD                                          0.9983165
## viq_all                                        -1.3130165
## piq_all                                        -1.8936051
## fsiq4_all                                      -1.5732377
## A_pct_severity                                  7.4316571
## B_pct_severity                                 -2.3455795
## ADI_social_total                                3.0779456
## ADI_communication_total                         3.9980082
## ADI_RRB_total                                  -0.7456188
## ados_2_SA_CSS                                   1.3724698
## ados_2_RRB_CSS                                 -0.4941902
## SRS_tscore_self                                 0.9108180
## RBS_total                                       1.0730500
## SSP_total                                      -0.2298125
## vabsdscoress_dss                               -3.8603176
## vabsdscoresd_dss                               -2.6534435
## vabsdscoresc_dss                               -2.7655544
## vabsabcabc_standard                            -4.1540532
##                         SCequalRRB_vs_SCoverRRB_Rep.pval
## age                                         6.210158e-01
## meanFD                                      3.201182e-01
## viq_all                                     1.917468e-01
## piq_all                                     6.070556e-02
## fsiq4_all                                   1.183413e-01
## A_pct_severity                              1.678557e-11
## B_pct_severity                              2.062474e-02
## ADI_social_total                            2.578856e-03
## ADI_communication_total                     1.102988e-04
## ADI_RRB_total                               4.573437e-01
## ados_2_SA_CSS                               1.725652e-01
## ados_2_RRB_CSS                              6.221064e-01
## SRS_tscore_self                             3.662953e-01
## RBS_total                                   2.857086e-01
## SSP_total                                   8.188716e-01
## vabsdscoress_dss                            1.923479e-04
## vabsdscoresd_dss                            9.169900e-03
## vabsdscoresc_dss                            6.675582e-03
## vabsabcabc_standard                         6.552828e-05
##                         SCequalRRB_vs_SCoverRRB_Rep.es
## age                                        0.092631311
## meanFD                                    -0.186560744
## viq_all                                    0.295813936
## piq_all                                    0.439664548
## fsiq4_all                                  0.367904628
## A_pct_severity                            -1.350293936
## B_pct_severity                             0.431758294
## ADI_social_total                          -0.549950345
## ADI_communication_total                   -0.711319192
## ADI_RRB_total                              0.138401647
## ados_2_SA_CSS                             -0.253070996
## ados_2_RRB_CSS                             0.000223351
## SRS_tscore_self                           -0.239627574
## RBS_total                                 -0.244260593
## SSP_total                                  0.172842856
## vabsdscoress_dss                           0.724827207
## vabsdscoresd_dss                           0.513171882
## vabsdscoresc_dss                           0.539616008
## vabsabcabc_standard                        0.765845685
##                         SCequalRRB_vs_SCoverRRB.repBF                varNames
## age                                      6.952307e-01                     age
## meanFD                                   3.075536e+00                  meanFD
## viq_all                                  6.969642e-01                 viq_all
## piq_all                                  7.088276e-01                 piq_all
## fsiq4_all                                6.897484e-01               fsiq4_all
## A_pct_severity                           1.538184e+04          A_pct_severity
## B_pct_severity                           1.957605e+07          B_pct_severity
## ADI_social_total                         7.401322e-01        ADI_social_total
## ADI_communication_total                  8.157931e-01 ADI_communication_total
## ADI_RRB_total                            8.768287e+04           ADI_RRB_total
## ados_2_SA_CSS                            6.886466e-01           ados_2_SA_CSS
## ados_2_RRB_CSS                           1.599530e+00          ados_2_RRB_CSS
## SRS_tscore_self                          1.207569e+00         SRS_tscore_self
## RBS_total                                7.121920e-01               RBS_total
## SSP_total                                7.875919e-01               SSP_total
## vabsdscoress_dss                         2.050609e+00        vabsdscoress_dss
## vabsdscoresd_dss                         7.381464e-01        vabsdscoresd_dss
## vabsdscoresc_dss                         6.876964e-01        vabsdscoresc_dss
## vabsabcabc_standard                      7.592313e-01     vabsabcabc_standard

SC-RRB difference z = 0.6

#------------------------------------------------------------------------------
# Subtype cut-off: make_subtype() labels z > z_thresh as SC_over_RRB,
# z < -z_thresh as RRB_over_SC, and everything else as SC_equal_RRB
z_thresh <- 0.6

# vars2use = c("dbaes_atotal","dbaes_btotal")

# Discovery: z-score the SC-RRB difference against Discovery's own mean and sd
ds_disc <- Dverbal_Discovery[[vars2use[1]]] - Dverbal_Discovery[[vars2use[2]]]
mean2use <- mean(ds_disc)
sd2use <- sd(ds_disc)
Dverbal_Discovery <- make_subtype(
  data2use = Dverbal_Discovery,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

# Replication: same procedure, but normed on Replication's own mean and sd
ds_rep <- Dverbal_Replication[[vars2use[1]]] - Dverbal_Replication[[vars2use[2]]]
mean2use <- mean(ds_rep)
sd2use <- sd(ds_rep)
Dverbal_Replication <- make_subtype(
  data2use = Dverbal_Replication,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#------------------------------------------------------------------------------
# Table of subtypes X sex

# Discovery: cross-tabulate subtype (rows) against sex (columns)
table(Dverbal_Discovery[["z_ds_group"]], Dverbal_Discovery[["sex"]])
##               
##                  F   M
##   RRB_over_SC   51 197
##   SC_equal_RRB  95 310
##   SC_over_RRB   51 185
# Chi-squared test of independence between subtype and sex (Discovery)
cs_res <- chisq.test(Dverbal_Discovery$z_ds_group, Dverbal_Discovery$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Discovery$z_ds_group and Dverbal_Discovery$sex
## X-squared = 0.80219, df = 2, p-value = 0.6696
# Replication: cross-tabulate subtype (rows) against sex (columns)
table(Dverbal_Replication[["z_ds_group"]], Dverbal_Replication[["sex"]])
##               
##                  F   M
##   RRB_over_SC   48 188
##   SC_equal_RRB 102 330
##   SC_over_RRB   46 176
# Chi-squared test of independence between subtype and sex (Replication)
cs_res <- chisq.test(Dverbal_Replication$z_ds_group, Dverbal_Replication$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Replication$z_ds_group and Dverbal_Replication$sex
## X-squared = 1.2434, df = 2, p-value = 0.537
#------------------------------------------------------------------------------
# Descriptive stats

# Discovery: descriptive statistics of each variable within each subtype
cols2use <- c(
  "z_ds_group", "sex", "age", "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)

df2use <- Dverbal_Discovery[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# age is interview_age divided by 12
df2use$age <- df2use$interview_age / 12

res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 248   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 248    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 248  10.02  4.19   9.50    9.67  3.46  2.00  27.17  25.17
## ados_age        4  34  98.76 43.79  89.00   97.07 50.41 27.00 202.00 175.00
## ados_sa_css     5  34   6.53  2.38   7.00    6.61  2.97  2.00  10.00   8.00
## ados_rrb_css    6  34   7.65  1.87   8.00    7.86  1.48  1.00  10.00   9.00
## iq              7  66 102.45 17.49 105.00  103.67 15.57 53.00 139.00  86.00
## dbaes_atotal    8 248   0.22  0.11   0.22    0.22  0.12  0.00   0.51   0.51
## dbaes_btotal    9 248   0.46  0.13   0.45    0.46  0.13  0.14   0.79   0.65
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           0.93     1.26 0.27
## ados_age      0.40    -0.79 7.51
## ados_sa_css  -0.29    -1.12 0.41
## ados_rrb_css -1.41     2.70 0.32
## iq           -0.68     0.52 2.15
## dbaes_atotal  0.11    -0.62 0.01
## dbaes_btotal  0.08    -0.36 0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 405   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 405    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 405   9.17  5.79   8.08    8.34  4.69   0  45.75  45.75  2.05
## ados_age        4  69  77.71 42.10  61.00   72.93 34.10  33 182.00 149.00  0.83
## ados_sa_css     5  69   6.84  2.04   7.00    6.95  1.48   1  10.00   9.00 -0.41
## ados_rrb_css    6  69   7.83  2.23   8.00    8.18  1.48   1  10.00   9.00 -1.59
## iq              7 101 104.53 18.27 106.00  105.68 19.27  42 138.00  96.00 -0.75
## dbaes_atotal    8 405   0.30  0.14   0.30    0.30  0.14   0   0.67   0.67 -0.02
## dbaes_btotal    9 405   0.31  0.13   0.31    0.32  0.14   0   0.68   0.68 -0.14
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              6.77 0.29
## ados_age        -0.48 5.07
## ados_sa_css     -0.01 0.25
## ados_rrb_css     2.50 0.27
## iq               1.03 1.82
## dbaes_atotal    -0.37 0.01
## dbaes_btotal    -0.21 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 236   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 236    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 236   7.31  4.96   5.83    6.46  2.97  1.67  37.33  35.67
## ados_age        4  50  69.88 36.57  64.50   63.25 27.43 30.00 172.00 142.00
## ados_sa_css     5  50   7.36  1.65   7.00    7.40  1.48  4.00  10.00   6.00
## ados_rrb_css    6  50   7.86  2.19   8.00    8.22  1.48  1.00  10.00   9.00
## iq              7  32 105.00 18.45 111.00  106.15 13.34 40.00 140.00 100.00
## dbaes_atotal    8 236   0.46  0.14   0.46    0.46  0.14  0.11   0.87   0.76
## dbaes_btotal    9 236   0.23  0.11   0.22    0.22  0.10  0.00   0.57   0.57
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.36     7.77 0.32
## ados_age      1.42     1.38 5.17
## ados_sa_css  -0.14    -0.71 0.23
## ados_rrb_css -1.69     3.15 0.31
## iq           -1.17     2.72 3.26
## dbaes_atotal  0.17    -0.19 0.01
## dbaes_btotal  0.28    -0.07 0.01
# Replication: descriptive statistics of each variable within each subtype
cols2use <- c(
  "z_ds_group", "sex", "age", "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)

df2use <- Dverbal_Replication[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# age is interview_age divided by 12
df2use$age <- df2use$interview_age / 12

res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 236   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 236    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 236   9.83  4.75   9.29    9.34  4.26  3.00  28.58  25.58
## ados_age        4  24  88.00 50.54  76.50   82.75 56.34 36.00 196.00 160.00
## ados_sa_css     5  24   6.50  1.91   7.00    6.55  1.48  3.00  10.00   7.00
## ados_rrb_css    6  24   7.67  1.52   7.50    7.70  1.48  5.00  10.00   5.00
## iq              7  71 102.72 18.36 104.00  103.26 17.79 57.00 152.00  95.00
## dbaes_atotal    8 236   0.23  0.11   0.23    0.22  0.10  0.01   0.61   0.60
## dbaes_btotal    9 236   0.48  0.13   0.47    0.47  0.11  0.21   0.93   0.72
##               skew kurtosis    se
## z_ds_group*    NaN      NaN  0.00
## sex*            NA       NA    NA
## age           1.21     2.10  0.31
## ados_age      0.69    -0.76 10.32
## ados_sa_css  -0.16    -0.89  0.39
## ados_rrb_css -0.16    -0.94  0.31
## iq           -0.21     0.40  2.18
## dbaes_atotal  0.62     0.73  0.01
## dbaes_btotal  0.53     0.74  0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 432   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 432    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 432   8.79  5.20   7.83    8.05  4.45   0  33.83  33.83  1.71
## ados_age        4  82  80.21 38.11  71.50   76.18 41.51  35 188.00 153.00  0.74
## ados_sa_css     5  82   6.90  2.15   7.00    6.97  2.97   2  10.00   8.00 -0.12
## ados_rrb_css    6  82   7.21  2.56   8.00    7.62  1.48   1  10.00   9.00 -1.22
## iq              7  80 108.08 15.91 108.00  107.80 13.34  64 146.00  82.00  0.06
## dbaes_atotal    8 432   0.31  0.14   0.30    0.31  0.14   0   0.74   0.74 -0.02
## dbaes_btotal    9 432   0.32  0.14   0.32    0.32  0.14   0   0.81   0.81  0.07
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              4.13 0.25
## ados_age        -0.31 4.21
## ados_sa_css     -0.93 0.24
## ados_rrb_css     0.72 0.28
## iq               0.19 1.78
## dbaes_atotal    -0.27 0.01
## dbaes_btotal     0.05 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 222   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 222    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 222   8.10  6.17   6.38    6.88  3.77  0.00  40.92  40.92
## ados_age        4  47  71.87 28.18  69.00   68.85 25.20 30.00 141.00 111.00
## ados_sa_css     5  47   7.17  1.75   7.00    7.18  1.48  3.00  10.00   7.00
## ados_rrb_css    6  47   7.49  2.14   8.00    7.69  1.48  1.00  10.00   9.00
## iq              7  35 110.37 18.04 111.00  111.07 17.79 62.00 146.00  84.00
## dbaes_atotal    8 222   0.48  0.13   0.48    0.47  0.13  0.14   0.96   0.82
## dbaes_btotal    9 222   0.24  0.12   0.24    0.24  0.12  0.00   0.66   0.66
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.16     5.47 0.41
## ados_age      0.94     0.23 4.11
## ados_sa_css   0.06    -0.76 0.26
## ados_rrb_css -1.07     1.30 0.31
## iq           -0.47    -0.20 3.05
## dbaes_atotal  0.39     0.20 0.01
## dbaes_btotal  0.23     0.03 0.01
#------------------------------------------------------------------------------
# Tests of differences in interview age across the subtypes

# Discovery: one-way ANOVA of interview age on subtype
mod2use <- lm(interview_age ~ z_ds_group, data = Dverbal_Discovery)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  134446   67223  17.478 3.591e-08 ***
## Residuals  886 3407620    3846                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: one-way ANOVA of interview age on subtype
mod2use <- lm(interview_age ~ z_ds_group, data = Dverbal_Replication)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value   Pr(>F)   
## z_ds_group   2   50929 25464.7  6.1841 0.002152 **
## Residuals  887 3652435  4117.7                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in SC across the subtypes

# Discovery: one-way ANOVA of the SC score (dbaes_atotal) on subtype
mod2use <- lm(dbaes_atotal ~ z_ds_group, data = Dverbal_Discovery)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.017  3.5085   206.5 < 2.2e-16 ***
## Residuals  886 15.054  0.0170                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: one-way ANOVA of the SC score (dbaes_atotal) on subtype
mod2use <- lm(dbaes_atotal ~ z_ds_group, data = Dverbal_Replication)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.426   3.713  218.26 < 2.2e-16 ***
## Residuals  887 15.089   0.017                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in RRB across the subtypes

# Discovery: one-way ANOVA of the RRB score (dbaes_btotal) on subtype
mod2use <- lm(dbaes_btotal ~ z_ds_group, data = Dverbal_Discovery)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.598  3.2990  212.94 < 2.2e-16 ***
## Residuals  886 13.727  0.0155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: one-way ANOVA of the RRB score (dbaes_btotal) on subtype
mod2use <- lm(dbaes_btotal ~ z_ds_group, data = Dverbal_Replication)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.8743  3.4371  197.27 < 2.2e-16 ***
## Residuals  887 15.4543  0.0174                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in ADOS SA CSS across the subtypes

# Discovery: one-way ANOVA of ados_sa_css on subtype
mod2use <- lm(ados_sa_css ~ z_ds_group, data = Dverbal_Discovery)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2  15.23  7.6168   1.894 0.1541
## Residuals  150 603.24  4.0216
# Replication: one-way ANOVA of ados_sa_css on subtype
mod2use <- lm(ados_sa_css ~ z_ds_group, data = Dverbal_Replication)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   7.20  3.6005  0.9003 0.4086
## Residuals  150 599.86  3.9991
#------------------------------------------------------------------------------
# Tests of differences in ADOS RRB CSS across the subtypes

# Discovery: one-way ANOVA of ados_rrb_css on subtype
mod2use <- lm(ados_rrb_css ~ z_ds_group, data = Dverbal_Discovery)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   1.02  0.5106  0.1114 0.8947
## Residuals  150 687.70  4.5847
# Replication: one-way ANOVA of ados_rrb_css on subtype
mod2use <- lm(ados_rrb_css ~ z_ds_group, data = Dverbal_Replication)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   4.95  2.4748  0.4684 0.6269
## Residuals  150 792.55  5.2837
#------------------------------------------------------------------------------
# Tests of differences in IQ across the subtypes

# Discovery: one-way ANOVA of iq on subtype
mod2use <- lm(iq ~ z_ds_group, data = Dverbal_Discovery)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2    217  108.61  0.3336 0.7167
## Residuals  196  63811  325.57
# Replication: one-way ANOVA of iq on subtype
mod2use <- lm(iq ~ z_ds_group, data = Dverbal_Replication)
print(anova(mod2use))
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## z_ds_group   2   1738  869.07  2.9092 0.05705 .
## Residuals  183  54668  298.73                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Make scatterplots with difference score Z subtypes in different colors
maxScores <- c(3, 4)

# Discovery: SC (x) against RRB (y), one point per individual, coloured by
# subtype; both axes are fixed to the 0-1 range
p_disc <- ggplot(
  data = Dverbal_Discovery,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Discovery")

# Legend-free copy for the saved figure. guides(colour = FALSE) is deprecated
# in ggplot2 (>= 3.3.4); "none" is the supported way to drop a guide.
p1_top_left <- p_disc + guides(colour = "none")
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p1_top_left)

# show the version with the legend in the report
p_disc

# number of individuals per subtype
table(Dverbal_Discovery$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          248          405          236
# Pearson correlation between the SC and RRB scores (Discovery)
cor_res <- cor.test(Dverbal_Discovery$dbaes_atotal, Dverbal_Discovery$dbaes_btotal)
print(cor_res)
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1554332 0.2806578
## sample estimates:
##       cor 
## 0.2189469
# Replication: SC (x) against RRB (y), one point per individual, coloured by
# subtype; both axes are fixed to the 0-1 range
p_rep <- ggplot(
  data = Dverbal_Replication,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Replication")

# Legend-free copy for the saved figure. guides(colour = FALSE) is deprecated
# in ggplot2 (>= 3.3.4); "none" is the supported way to drop a guide.
p2_bottom_left <- p_rep + guides(colour = "none")
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p2_bottom_left)

# show the version with the legend in the report
p_rep

# number of individuals per subtype
table(Dverbal_Replication$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          236          432          222
# Pearson correlation between the SC and RRB scores (Replication)
cor_res <- cor.test(Dverbal_Replication$dbaes_atotal, Dverbal_Replication$dbaes_btotal)
print(cor_res)
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1700824 0.2943934
## sample estimates:
##       cor 
## 0.2331903
#------------------------------------------------------------------------------
# Run supervised model with Discovery as training and Replication as Test

# Discovery plays the role of the training set, Replication the test set
train_data <- Dverbal_Discovery
test_data <- Dverbal_Replication

# Training-set norms: mean and sd of the SC-RRB difference score
train_mean <- mean(train_data[, vars2use[1]] - train_data[, vars2use[2]])
train_sd <- sd(train_data[, vars2use[1]] - train_data[, vars2use[2]])
tmp_train <- make_subtype(
  data2use = train_data,
  z_thresh = z_thresh,
  mean2use = train_mean,
  sd2use = train_sd
)

# Reference labels for the test set: z-scored against the test set's own norms
mean2use <- mean(test_data[, vars2use[1]] - test_data[, vars2use[2]])
sd2use <- sd(test_data[, vars2use[1]] - test_data[, vars2use[2]])
tmp_test <- make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#==============================================================================
#==============================================================================
# Predicted labels for the test set: z-scored against the training-set norms
pred_labels <- make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = train_mean,
  sd2use = train_sd
)

# rows = labels from the test set's own norms, columns = labels from the
# training norms; accuracy is the share of individuals on the diagonal
confmat <- table(tmp_test$z_ds_group, pred_labels$z_ds_group)
acc <- (confmat[1, 1] + confmat[2, 2] + confmat[3, 3]) / nrow(pred_labels)

# leave mean2use / sd2use holding the training-set norms
mean2use <- train_mean
sd2use <- train_sd

# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#==============================================================================
#==============================================================================

# plot confusion matrix
# confmat is table(tmp_test$z_ds_group, pred_labels$z_ds_group): rows hold the
# labels derived from the test set's own norms (actual), columns hold the
# labels derived from the training norms (predicted). Cell colour is the
# percentage within each row; the printed numbers are the raw counts.

# Shrink the plotting viewport to the top-right 90% of the page so there is
# room for axis captions on the left and along the bottom
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap expects `breaks` to be one element longer than `color`, so 100
# colours need 101 break points (also spells out length.out instead of relying
# on partial argument matching)
pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
         breaks = seq(0, 100, length.out = 101))
setHook("grid.newpage", NULL, "replace")
# Caption below the heatmap describes the columns (predicted labels); the
# rotated caption on the left describes the rows (actual labels). The two
# captions were previously swapped relative to how confmat is built.
grid::grid.text("Predicted Labels", y=-0.07, gp=gpar(fontsize=fontSize))
grid::grid.text("Actual Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))

# show accuracy
true_accuracy <- acc
true_accuracy
## [1] 0.9820225
#================================================================================
#================================================================================
# #------------------------------------------------------------------------------
# # Permute subtype labels to examine how well supervised model performs
#
# # nperm = 10000
#
# # make subtypes using z-scores computed from the mean and sd of the training set
# train_data = Dverbal_Discovery
# test_data = Dverbal_Replication
# mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# tmp_train = make_subtype(data2use = train_data,
#                          z_thresh = z_thresh,
#                          mean2use = mean2use,
#                          sd2use = sd2use)
#
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# tmp_test = make_subtype(data2use = test_data,
#                         z_thresh = z_thresh,
#                         mean2use = mean2use,
#                         sd2use = sd2use)
#
# acc = vector(length = nperm)
# for (iperm in 1:nperm){
#   # set seed for reproducibility
#   set.seed(iperm)
#
#   sc_perm = sample(train_data[,vars2use[1]])
#   rrb_perm = sample(train_data[,vars2use[2]])
#   perm_mean2use = mean(sc_perm - rrb_perm)
#   perm_sd2use = sd(sc_perm - rrb_perm)
#   # perm_mean2use = mean(train_data[,vars2use[1]] - rrb_perm)
#   # perm_sd2use = sd(train_data[,vars2use[1]] - rrb_perm)
#   pred_labels = make_subtype(data2use = tmp_test,
#                         z_thresh = z_thresh,
#                         mean2use = perm_mean2use,
#                         sd2use = perm_sd2use)
#   confmat = table(tmp_test$z_ds_group,pred_labels$z_ds_group)
#   acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/dim(pred_labels)[1]
#
#   # compute model
#   permuted_labels = sample(tmp_train$z_ds_group)
#   mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
#   pred_labels = predict(mod2use, tmp_test[,vars2use])
#   confmat = table(tmp_test$z_ds_group,pred_labels)
#   acc[iperm] = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#   #============================================================================
# } # for (iperm in 1:nperm)
#
# df2plot = data.frame(Accuracy = acc)
# p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
# p
#
# # compute p-value
# pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
# pval
#================================================================================
#================================================================================

#------------------------------------------------------------------------------
# Plot difference score Z subtypes in Discovery set
maxScores <- c(3, 4)

# Discovery - make plot with all individuals shown as lines
df2use <- Dverbal_Discovery[, c("subjectkey", adi_total_vars2use)]
# Take the subtype labels from the same data frame that supplies the rows
# rather than from the temporary tmp_train, so the plot does not depend on
# which data set tmp_train currently holds. Both were produced by
# make_subtype() on Discovery with Discovery's own mean and sd.
df2use$subgrp <- factor(Dverbal_Discovery$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
# NOTE(review): assumes adi_total_vars2use contains dbaes_atotal and
# dbaes_btotal - confirm against its definition
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per individual per subscale (SC, RRB)
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

# One panel per subtype; each individual's SC and RRB points are joined by a
# faint line. The colour legend is dropped once with guides(colour = "none");
# guides(colour = FALSE) is deprecated in ggplot2 (>= 3.3.4).
p <- ggplot(data = df4plot, aes(x = variable,
                                y = value,
                                colour = subgrp,
                                group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  ylim(0, 1) +
  guides(colour = "none") +
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p3_middle_top <- p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p3_middle_top)
p

# Plot difference score Z subtypes in Replication set
maxScores <- c(3, 4)

# Replication - make plot with all individuals shown as lines
df2use <- Dverbal_Replication[, c("subjectkey", adi_total_vars2use)]
# Take the subtype labels from the same data frame that supplies the rows
# rather than from the temporary tmp_test, so the plot does not depend on
# which data set tmp_test currently holds. Both were produced by
# make_subtype() on Replication with Replication's own mean and sd.
df2use$subgrp <- factor(Dverbal_Replication$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
# NOTE(review): assumes adi_total_vars2use contains dbaes_atotal and
# dbaes_btotal - confirm against its definition
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per individual per subscale (SC, RRB)
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

# One panel per subtype; each individual's SC and RRB points are joined by a
# faint line. The colour legend is dropped once with guides(colour = "none");
# guides(colour = FALSE) is deprecated in ggplot2 (>= 3.3.4).
p <- ggplot(data = df4plot, aes(x = variable,
                                y = value,
                                colour = subgrp,
                                group = subjectkey)) +
  facet_grid(. ~ subgrp) +
  geom_point(shape = 1) +
  geom_line(alpha = 0.2) +
  ylim(0, 1) +
  guides(colour = "none") +
  ylab("Percent Severity") +
  xlab("ADI-R subscale")
p4_middle_bottom <- p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p4_middle_bottom)
p

#------------------------------------------------------------------------------
# Apply NDAR subtypes to EU-AIMS LEAP data

# Read the full NDAR verbal data set (training) and the EU-AIMS data (test)
Dverbal <- read.csv(file.path(datapath, "tidy_verbal.csv"))
euaims_data <- read.csv(file.path(datapath, "tidy_euaims.csv"))

# Keep EU-AIMS rows with an ASD diagnosis and no missing value in any of the
# seven A1-A3 / B1-B4 percent-severity columns
mask1 <- euaims_data$Diagnosis == "ASD"
mask2 <- (is.na(euaims_data$A1_pct_severity) |
            is.na(euaims_data$A2_pct_severity) |
            is.na(euaims_data$A3_pct_severity) |
            is.na(euaims_data$B1_pct_severity) |
            is.na(euaims_data$B2_pct_severity) |
            is.na(euaims_data$B3_pct_severity) |
            is.na(euaims_data$B4_pct_severity))
euaims_data <- subset(euaims_data, (mask1 & !mask2))

# The NDAR SC / RRB columns are used exactly as read from file; the earlier
# self-assignments of Dverbal[, vars2use[i]] changed nothing and were removed.
# For EU-AIMS, build the same-named columns as the mean of the A1-A3 (SC) and
# B1-B4 (RRB) percent-severity scores.
euaims_data[, vars2use[1]] <- (euaims_data$A1_pct_severity +
                                 euaims_data$A2_pct_severity +
                                 euaims_data$A3_pct_severity) / 3
euaims_data[, vars2use[2]] <- (euaims_data$B1_pct_severity +
                                 euaims_data$B2_pct_severity +
                                 euaims_data$B3_pct_severity +
                                 euaims_data$B4_pct_severity) / 4

train_data <- Dverbal
test_data <- euaims_data

# NDAR norms (mean and sd of the SC-RRB difference), ignoring missing values
mean2use <- mean(train_data[, vars2use[1]] - train_data[, vars2use[2]], na.rm = TRUE)
sd2use <- sd(train_data[, vars2use[1]] - train_data[, vars2use[2]], na.rm = TRUE)
c(mean2use, sd2use)
## [1] -0.01045243  0.19482749
# Label both data sets with make_subtype using the same threshold and the
# same training-derived mean and sd of the difference score.
subtype_args = list(z_thresh = z_thresh, mean2use = mean2use, sd2use = sd2use)

tmp_train = do.call(make_subtype, c(list(data2use = train_data), subtype_args))

tmp_test = do.call(make_subtype, c(list(data2use = test_data), subtype_args))

#===========================================================================
#===========================================================================
# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#
# tmp_test$svm_pred_labels = pred_labels
#
# # plot confusion matrix
# setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
#          breaks= seq(0,100, length=100))
# setHook("grid.newpage", NULL, "replace")
# grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=fontSize))
# grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))
#===========================================================================
#===========================================================================

# scatterplots of SC against RRB, coloured by subtype; both panels share the
# same mapping, axis labels and 0-0.8 limits and differ only in data and title
subtype_aes = aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))
scatter_layers = list(geom_point(),
                      xlab("Social-Communication"),
                      ylab("Restricted Repetitive Behaviors"),
                      ylim(0,0.8),
                      xlim(0,0.8))

p1 = ggplot(data = tmp_train, subtype_aes) + scatter_layers + ggtitle("NDAR ALL")
p1

p2 = ggplot(data = tmp_test, subtype_aes) + scatter_layers + ggtitle("EU-AIMS with Groups from NDAR ALL")
p2

#===========================================================================
#===========================================================================
# p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS")
# p5_bottom_right = p3 + guides(colour=FALSE)
# ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
# p3
#===========================================================================
#===========================================================================

# write out EU-AIMS LEAP data with subgroups defined by NDAR All
# (the file name carries the z threshold used for subtyping)
euaims_subtype_file = sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv",as.character(z_thresh))
write.csv(tmp_test, file = file.path(datapath, euaims_subtype_file))

#===========================================================================
#===========================================================================
# # Make final plot
# p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
# ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
# p_final
#===========================================================================
#===========================================================================

#------------------------------------------------------------------------------
# Integrate subtypes with rest of EU-AIMS LEAP data
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))

# TD participants get no severity scores and the subgroup label "TD".
# NOTE(review): select = 2:6 picks columns by position; for the rbind below to
# succeed they must be the five identifier columns listed first in cols2use
# (subid, age, Centre, Schedule, Diagnosis) - confirm against tidy_euaims.csv.
td_df = subset(euaims_data, euaims_data$Diagnosis=="TD",select=2:6)
td_df$A_pct_severity = as.numeric(NA)
td_df$B_pct_severity = as.numeric(NA)
td_df$subgrp = "TD"

#===========================================================================
#===========================================================================
# cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","z_ds_group")
#===========================================================================
#===========================================================================

# ASD participants: rename the score and subtype columns to match td_df.
# Columns are located by name rather than by position 6/7/8 so the renaming
# stays correct if cols2use is ever reordered.
tmp_asd = tmp_test[,cols2use]
rename_map = c(dbaes_atotal = "A_pct_severity",
               dbaes_btotal = "B_pct_severity",
               z_ds_group = "subgrp")
colnames(tmp_asd)[match(names(rename_map), colnames(tmp_asd))] = unname(rename_map)
asd_df = tmp_asd

# (an rbind of td_df and asd_df used to sit here; its result was overwritten
# below before being used, so it has been removed)

# NOTE(review): absolute, machine-specific path - this file is not under the
# project's data directory, so the script only runs where this path exists.
fname = "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx"
pp_data = read_excel(fname)
# keep only rows whose notes column is "ok"
mask = pp_data$notes=="ok"
pp_data = subset(pp_data, mask)

# attach sex from the preprocessing sheet; the inner merge also drops
# participants that are absent from pp_data
asd_df = merge(pp_data[,c("subid","sex")],asd_df, by = "subid")
td_df = merge(pp_data[,c("subid","sex")],td_df, by = "subid")

all_data = rbind(td_df,asd_df)

# columns present in both inputs come back suffixed .x (pp_data) and .y
# (all_data); age is taken from pp_data and sex from all_data
data2write = merge(pp_data, all_data, by = "subid")
data2write$age = data2write$age.x
data2write$sex = data2write$sex.y
print(table(data2write$Schedule, data2write$subgrp))
##    
##     RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   A           8           32          40 78
##   B           6           35          44 83
##   C           4           26          33 59
##   D           1            7          30 23
# participants per acquisition centre and subgroup
print(table(data2write[["Centre"]], data2write[["subgrp"]]))
##                
##                 RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   CAMBRIDGE               4           24          14 29
##   KINGS_COLLEGE          10           39          57 78
##   MANNHEIM                0            0           0 34
##   NIJMEGEN                5           31          57 64
##   UTRECHT                 0            6          19 38
# participants per sex and subgroup
print(table(data2write[["sex"]], data2write[["subgrp"]]))
##         
##          RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Female           6           25          40  88
##   Male            13           75         107 155
# DSM-5 - find best split that balances participants across sites.
# Only seed 172342 is evaluated here; the original call carried a
# commented-out extension of the range: #,300001:500000))
a = findBestSplit(asd_df, seed_range = c(172342))
## [1] 172342
# keep every seed whose discrepancy equals the smallest non-missing discrepancy
min_discrepancy = min(a$discrepancy, na.rm = TRUE)
best_seeds = a$seed[which(a$discrepancy == min_discrepancy)]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
rngSeed = best_seeds[1]

# Split the ASD participants of one Schedule with SplitDatasetsBySex, always
# with the same seed. This replaces four copy-pasted stanzas (A, B, C, D).
split_schedule = function(sched_id) {
  sched_rows = subset(asd_df, asd_df$Schedule == sched_id)
  SplitDatasetsBySex(sched_rows, rngSeed = rngSeed)
}
schedule_splits = lapply(c(A = "A", B = "B", C = "C", D = "D"), split_schedule)

# element 2 of each split is used as Discovery, element 1 as Replication
A_Discovery = schedule_splits$A[[2]]
A_Replication = schedule_splits$A[[1]]

B_Discovery = schedule_splits$B[[2]]
B_Replication = schedule_splits$B[[1]]

C_Discovery = schedule_splits$C[[2]]
C_Replication = schedule_splits$C[[1]]

D_Discovery = schedule_splits$D[[2]]
D_Replication = schedule_splits$D[[1]]

# stack the four schedules back together, in A-D order
df_Disc = rbind(A_Discovery, B_Discovery, C_Discovery, D_Discovery)
df_Rep = rbind(A_Replication, B_Replication, C_Replication, D_Replication)

# Schedule x Centre counts in the Discovery half
a = table(df_Disc$Schedule, df_Disc$Centre); print(a)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         6            17       10       7
##   B         9            16       14       3
##   C         6             9       14       2
##   D         0            10       10       0
# Schedule x Centre counts in the Replication half
b = table(df_Rep$Schedule, df_Rep$Centre)
print(b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         7            18       10       5
##   B         8            17       13       5
##   C         6            10       13       3
##   D         0             9        9       0
# cell-wise Discovery minus Replication counts
print(a - b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A        -1            -1        0       2
##   B         1            -1        1      -2
##   C         0            -1        1      -1
##   D         0             1        1       0
# total absolute difference in counts over all Schedule x Centre cells
print(sum(abs(a - b)))
## [1] 14
# Tag each row with the half its subid was assigned to; rows in neither half
# stay NA.
data2write$dataset = NA
data2write$dataset[data2write$subid %in% df_Disc$subid] = "Discovery"
data2write$dataset[data2write$subid %in% df_Rep$subid] = "Replication"

# ASD rows of each half
asd_Disc = subset(data2write, dataset == "Discovery" & Diagnosis == "ASD")
asd_Rep = subset(data2write, dataset == "Replication" & Diagnosis == "ASD")

# # find which seed gives best TD age-match -------------------------------------
# seeds = 1:1000
# pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
# for (i in 1:length(seeds)) {
#   res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
#   td_Disc_matched = res[[2]]
#   td_Rep_matched = res[[1]]
#   tres = t.test(td_Disc_matched$age, asd_Disc$age)
#   pvals[i,1] = tres$p.value
#   tres = t.test(td_Rep_matched$age, asd_Rep$age)
#   pvals[i,2] = tres$p.value
#   #print(i)
# }
# a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
# b=pvals[a$ix,]

# Run the TD age matching with a single fixed seed (both ends of seed_range
# are the same value).
seed2use = 929
res = findTDAgeMatch(data2write, seed_range = rep(seed2use, 2))
# element 2 is used as the Discovery match, element 1 as the Replication match
td_Rep_matched = res[[1]]
td_Disc_matched = res[[2]]

# record the assigned half for the matched TD participants
data2write$dataset[data2write$subid %in% td_Disc_matched$subid] = "Discovery"
data2write$dataset[data2write$subid %in% td_Rep_matched$subid] = "Replication"

# final dataset x subgroup counts
print(table(data2write[["dataset"]], data2write[["subgrp"]]))
##              
##               RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Discovery            13           48          72 121
##   Replication           6           52          75 122
# save the combined table at the project root; the file name carries the
# z threshold used for subtyping
subgrp_csv = sprintf("asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv",as.character(z_thresh))
fname2save = here(subgrp_csv)
write.csv(data2write, file = fname2save)


#------------------------------------------------------------------------------
# Descriptive stats

# cognitive, symptom and adaptive-behaviour measures to summarise
measure_vars = c("viq_all","piq_all","fsiq4_all",
                 "A_pct_severity","B_pct_severity",
                 "ADI_social_total","ADI_communication_total","ADI_RRB_total",
                 "ados_2_SA_CSS","ados_2_RRB_CSS",
                 "SRS_tscore","SRS_tscore_self","RBS_total","SSP_total",
                 "vabsdscoresc_dss","vabsdscoresd_dss","vabsdscoress_dss",
                 "vabsabcabc_standard")
id_vars = c("subid","Diagnosis","dataset","Centre","meanFD","age","sex","subgrp")
df2use = data2write[, c(id_vars, measure_vars)]

# 999 and 777 are blanked to NA in every selected column
# NOTE(review): presumably missing-data codes - confirm with the data dictionary
mask = (df2use == 999) | (df2use == 777)
df2use[mask] = NA

# NOTE(review): dividing by 365 suggests age is stored in days - confirm
df2use$age = df2use$age/365

# summarise every subgrp x dataset cell; the two grouping columns are part of
# the table passed to describeBy, and they appear as NaN/Inf rows in the printout
cols2use = c("dataset","subgrp","age","meanFD", measure_vars)
res = describeBy(x = df2use[, cols2use], group = c("subgrp","dataset"))
res
## 
##  Descriptive statistics by group 
## subgrp: RRB_over_SC
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad    min    max
## dataset*                   1 13    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 13    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 13  17.31  5.96  18.04   17.56  7.26   7.89  24.03
## meanFD                     4 13   0.24  0.28   0.18    0.18  0.04   0.06   1.14
## viq_all                    5 13  97.69 14.94  96.00   97.55 19.27  73.00 123.85
## piq_all                    6 13  99.46 15.95  99.00  100.00 13.34  64.00 129.00
## fsiq4_all                  7 13  98.70 14.08  96.00   99.19 11.86  74.00 118.01
## A_pct_severity             8 13   0.19  0.13   0.14    0.18  0.14   0.00   0.41
## B_pct_severity             9 13   0.41  0.14   0.42    0.41  0.15   0.13   0.59
## ADI_social_total          10 13  16.69  8.10  20.00   17.18  7.41   2.00  26.00
## ADI_communication_total   11 13  14.77  6.52  16.00   15.09  5.93   2.00  24.00
## ADI_RRB_total             12 13   7.69  2.25   8.00    7.91  1.48   3.00  10.00
## ados_2_SA_CSS             13 13   4.62  2.96   3.00    4.55  2.97   1.00   9.00
## ados_2_RRB_CSS            14 13   4.38  3.88   1.00    4.18  0.00   1.00  10.00
## SRS_tscore                15  9  74.33 10.52  75.00   74.33 13.34  58.00  90.00
## SRS_tscore_self           16  6  60.17  5.88  62.00   60.17  3.71  49.00  65.00
## RBS_total                 17  8  18.75  9.08  19.00   18.75 10.38   7.00  33.00
## SSP_total                 18  6 137.50 21.57 132.00  137.50 21.50 116.00 167.00
## vabsdscoresc_dss          19  9  75.22 22.37  77.00   75.22 11.86  29.00 115.00
## vabsdscoresd_dss          20  9  63.78 14.25  65.00   63.78 11.86  31.00  79.00
## vabsdscoress_dss          21  9  64.89 14.06  69.00   64.89 11.86  35.00  82.00
## vabsabcabc_standard       22  9  70.89  8.25  70.00   70.89  5.93  59.00  88.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     16.14 -0.27    -1.67 1.65
## meanFD                   1.09  2.59     5.61 0.08
## viq_all                 50.85 -0.06    -1.19 4.14
## piq_all                 65.00 -0.29    -0.05 4.42
## fsiq4_all               44.01 -0.10    -1.29 3.91
## A_pct_severity           0.41  0.25    -1.40 0.04
## B_pct_severity           0.46 -0.52    -0.97 0.04
## ADI_social_total        24.00 -0.45    -1.40 2.25
## ADI_communication_total 22.00 -0.36    -1.00 1.81
## ADI_RRB_total            7.00 -0.70    -0.64 0.62
## ados_2_SA_CSS            8.00  0.31    -1.64 0.82
## ados_2_RRB_CSS           9.00  0.24    -1.96 1.08
## SRS_tscore              32.00 -0.01    -1.46 3.51
## SRS_tscore_self         16.00 -0.99    -0.72 2.40
## RBS_total               26.00  0.18    -1.44 3.21
## SSP_total               51.00  0.28    -1.97 8.80
## vabsdscoresc_dss        86.00 -0.35     0.15 7.46
## vabsdscoresd_dss        48.00 -1.13     0.32 4.75
## vabsdscoress_dss        47.00 -0.80    -0.38 4.69
## vabsabcabc_standard     29.00  0.64    -0.39 2.75
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 48    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 48    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 48  16.25  5.89  14.92   15.91  4.95  7.08  30.28
## meanFD                     4 48   0.32  0.56   0.23    0.23  0.15  0.04   3.95
## viq_all                    5 47  97.22 18.05  97.35   97.31 19.65 64.00 136.00
## piq_all                    6 47 100.08 19.23 102.80  100.65 16.60 61.00 142.00
## fsiq4_all                  7 48  98.91 17.44 103.86   99.47 19.27 60.00 131.00
## A_pct_severity             8 48   0.32  0.14   0.32    0.31  0.14  0.03   0.63
## B_pct_severity             9 48   0.32  0.16   0.29    0.32  0.15  0.01   0.69
## ADI_social_total          10 48  17.40  6.89  18.50   17.73  6.67  3.00  27.00
## ADI_communication_total   11 48  14.29  6.17  14.00   14.38  7.41  0.00  26.00
## ADI_RRB_total             12 48   5.54  2.39   5.50    5.60  2.22  0.00  12.00
## ados_2_SA_CSS             13 47   6.17  2.65   6.00    6.23  2.97  1.00  10.00
## ados_2_RRB_CSS            14 47   5.11  2.74   5.00    5.13  2.97  1.00  10.00
## SRS_tscore                15 44  71.30 11.81  72.50   71.47 13.34 47.00  90.00
## SRS_tscore_self           16 26  62.19 11.97  65.00   61.95 12.60 43.00  89.00
## RBS_total                 17 43  17.05 13.47  15.00   15.20 11.86  0.00  53.00
## SSP_total                 18 27 134.19 29.86 137.00  134.35 25.20 81.00 187.00
## vabsdscoresc_dss          19 47  72.26 18.04  75.00   73.21 13.34 21.00 122.00
## vabsdscoresd_dss          20 46  72.76 14.52  72.00   73.16 10.38 25.00 105.00
## vabsdscoress_dss          21 47  71.96 14.97  74.00   73.49 11.86 20.00  95.00
## vabsabcabc_standard       22 46  70.17 13.26  72.00   71.11  9.64 20.00 100.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.20  0.55    -0.59 0.85
## meanFD                    3.91  5.62    33.37 0.08
## viq_all                  72.00 -0.11    -0.77 2.63
## piq_all                  81.00 -0.31    -0.59 2.81
## fsiq4_all                71.00 -0.33    -0.79 2.52
## A_pct_severity            0.60  0.12    -0.66 0.02
## B_pct_severity            0.68  0.33    -0.35 0.02
## ADI_social_total         24.00 -0.44    -0.92 0.99
## ADI_communication_total  26.00 -0.15    -0.83 0.89
## ADI_RRB_total            12.00 -0.08    -0.03 0.34
## ados_2_SA_CSS             9.00 -0.21    -1.17 0.39
## ados_2_RRB_CSS            9.00 -0.37    -1.03 0.40
## SRS_tscore               43.00 -0.19    -1.10 1.78
## SRS_tscore_self          46.00  0.07    -0.99 2.35
## RBS_total                53.00  1.15     0.74 2.05
## SSP_total               106.00 -0.10    -1.05 5.75
## vabsdscoresc_dss        101.00 -0.40     1.21 2.63
## vabsdscoresd_dss         80.00 -0.53     1.44 2.14
## vabsdscoress_dss         75.00 -1.18     1.97 2.18
## vabsabcabc_standard      80.00 -1.15     3.22 1.96
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 72    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 72    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 72  16.26  5.21  15.88   16.08  5.11  7.56  29.40
## meanFD                     4 72   0.23  0.22   0.15    0.18  0.10  0.03   1.08
## viq_all                    5 71  98.23 19.62 100.00   98.10 20.22 61.00 142.00
## piq_all                    6 71  99.28 22.87 102.00  100.05 21.93 52.43 150.00
## fsiq4_all                  7 72  98.84 19.86 102.25   99.36 19.60 59.00 143.00
## A_pct_severity             8 72   0.42  0.14   0.44    0.42  0.13  0.16   0.82
## B_pct_severity             9 72   0.17  0.10   0.16    0.17  0.12  0.00   0.46
## ADI_social_total          10 72  17.57  6.48  18.00   17.90  8.90  3.00  28.00
## ADI_communication_total   11 72  14.10  4.98  14.50   14.24  5.19  2.00  24.00
## ADI_RRB_total             12 72   3.15  2.13   3.00    3.02  1.48  0.00  10.00
## ados_2_SA_CSS             13 70   6.41  2.51   7.00    6.61  2.97  1.00  10.00
## ados_2_RRB_CSS            14 70   4.67  2.75   5.00    4.59  2.97  1.00  10.00
## SRS_tscore                15 61  73.07 12.14  74.00   73.71 13.34 44.00  95.00
## SRS_tscore_self           16 28  62.96 12.16  61.00   62.04  9.64 42.00  94.00
## RBS_total                 17 59  17.07 17.01  14.00   14.55 16.31  0.00  90.00
## SSP_total                 18 46 138.93 30.98 140.00  140.08 37.06 53.00 189.00
## vabsdscoresc_dss          19 67  72.25 16.33  72.00   73.00 10.38 21.00 107.00
## vabsdscoresd_dss          20 67  71.90 17.73  72.00   71.45 13.34 17.00 131.00
## vabsdscoress_dss          21 67  67.70 16.30  69.00   68.80 13.34 20.00 104.00
## vabsabcabc_standard       22 67  68.51 15.50  70.00   69.27 11.86  6.00 103.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      21.85  0.33    -0.50 0.61
## meanFD                    1.05  2.20     4.74 0.03
## viq_all                  81.00  0.06    -0.64 2.33
## piq_all                  97.57 -0.26    -0.58 2.71
## fsiq4_all                84.00 -0.24    -0.88 2.34
## A_pct_severity            0.66  0.27    -0.27 0.02
## B_pct_severity            0.46  0.47    -0.44 0.01
## ADI_social_total         25.00 -0.34    -0.91 0.76
## ADI_communication_total  22.00 -0.23    -0.60 0.59
## ADI_RRB_total            10.00  0.74     0.31 0.25
## ados_2_SA_CSS             9.00 -0.57    -0.72 0.30
## ados_2_RRB_CSS            9.00 -0.19    -1.15 0.33
## SRS_tscore               51.00 -0.35    -0.40 1.55
## SRS_tscore_self          52.00  1.00     0.54 2.30
## RBS_total                90.00  1.80     4.43 2.21
## SSP_total               136.00 -0.40    -0.39 4.57
## vabsdscoresc_dss         86.00 -0.77     2.04 2.00
## vabsdscoresd_dss        114.00  0.25     1.86 2.17
## vabsdscoress_dss         84.00 -0.72     0.83 1.99
## vabsabcabc_standard      97.00 -1.17     3.91 1.89
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Discovery
##                         vars   n   mean    sd median trimmed   mad    min
## dataset*                   1 121    NaN    NA     NA     NaN    NA    Inf
## subgrp*                    2 121    NaN    NA     NA     NaN    NA    Inf
## age                        3 121  16.83  5.23  16.65   16.73  5.69   7.22
## meanFD                     4 121   0.18  0.15   0.13    0.15  0.07   0.03
## viq_all                    5 119 104.52 19.70 105.00  105.03 19.27  46.00
## piq_all                    6 119 106.10 19.47 107.00  107.53 17.79  49.00
## fsiq4_all                  7 119 105.72 18.33 108.18  106.99 16.58  53.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA    Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA    Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA    Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA    Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA    Inf
## SRS_tscore                15  68  47.84  9.40  45.00   46.32  5.19  37.00
## SRS_tscore_self           16  71  46.69  4.85  46.00   46.26  4.45  39.00
## RBS_total                 17  68   2.15  4.74   0.00    0.95  0.00   0.00
## SSP_total                 18  59 177.86 12.71 182.00  179.78  8.90 122.00
## vabsdscoresc_dss          19  34  91.97 25.44  99.50   93.50 21.50  21.00
## vabsdscoresd_dss          20  34  90.74 20.28  98.50   92.25 17.05  33.00
## vabsdscoress_dss          21  34  96.21 23.67 102.50   98.57 21.50  33.00
## vabsabcabc_standard       22  34  92.06 23.04  99.50   93.89 15.57  25.00
##                            max  range  skew kurtosis   se
## dataset*                  -Inf   -Inf    NA       NA   NA
## subgrp*                   -Inf   -Inf    NA       NA   NA
## age                      29.84  22.62  0.17    -0.51 0.48
## meanFD                    0.85   0.82  2.28     5.65 0.01
## viq_all                 160.00 114.00 -0.24     0.38 1.81
## piq_all                 147.00  98.00 -0.69     0.32 1.79
## fsiq4_all               142.00  89.00 -0.69     0.58 1.68
## A_pct_severity            -Inf   -Inf    NA       NA   NA
## B_pct_severity            -Inf   -Inf    NA       NA   NA
## ADI_social_total          -Inf   -Inf    NA       NA   NA
## ADI_communication_total   -Inf   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf   -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf   -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf   -Inf    NA       NA   NA
## SRS_tscore               76.00  39.00  1.54     1.44 1.14
## SRS_tscore_self          63.00  24.00  0.93     1.10 0.58
## RBS_total                27.00  27.00  3.13    10.87 0.57
## SSP_total               190.00  68.00 -1.90     4.85 1.66
## vabsdscoresc_dss        138.00 117.00 -0.71     0.36 4.36
## vabsdscoresd_dss        121.00  88.00 -0.80     0.07 3.48
## vabsdscoress_dss        129.00  96.00 -0.83    -0.31 4.06
## vabsabcabc_standard     127.00 102.00 -0.90     0.30 3.95
## ------------------------------------------------------------ 
## subgrp: RRB_over_SC
## dataset: Replication
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 6    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 6    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 6  14.70  3.15  13.73   14.70  2.60  11.45  19.56
## meanFD                     4 6   0.20  0.10   0.20    0.20  0.08   0.10   0.38
## viq_all                    5 6 106.24 18.88 100.86  106.24  9.12  91.00 143.00
## piq_all                    6 6 108.81 22.32 101.90  108.81 17.62  89.00 148.00
## fsiq4_all                  7 6 108.00 20.20 101.86  108.00  6.47  93.00 148.00
## A_pct_severity             8 6   0.15  0.08   0.16    0.15  0.05   0.04   0.27
## B_pct_severity             9 6   0.32  0.10   0.32    0.32  0.12   0.18   0.45
## ADI_social_total          10 6  13.83  5.12  14.50   13.83  4.45   5.00  19.00
## ADI_communication_total   11 6   7.83  3.13   8.00    7.83  3.71   3.00  11.00
## ADI_RRB_total             12 6   4.50  2.07   4.50    4.50  1.48   1.00   7.00
## ados_2_SA_CSS             13 6   4.67  2.73   5.00    4.67  3.71   1.00   8.00
## ados_2_RRB_CSS            14 6   4.33  2.73   5.00    4.33  2.97   1.00   7.00
## SRS_tscore                15 5  66.20 12.26  72.00   66.20 10.38  48.00  79.00
## SRS_tscore_self           16 2  64.50  3.54  64.50   64.50  3.71  62.00  67.00
## RBS_total                 17 5  14.60 10.01  13.00   14.60  8.90   5.00  30.00
## SSP_total                 18 3 142.67  9.07 139.00  142.67  4.45 136.00 153.00
## vabsdscoresc_dss          19 6  76.50 11.40  74.00   76.50  4.45  68.00  99.00
## vabsdscoresd_dss          20 6  72.50  6.72  71.00   72.50  4.45  66.00  85.00
## vabsdscoress_dss          21 6  81.50  9.52  82.00   81.50  8.15  67.00  95.00
## vabsabcabc_standard       22 6  69.17 15.55  75.00   69.17  6.67  39.00  81.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                      8.11  0.41    -1.73 1.29
## meanFD                   0.28  0.66    -1.05 0.04
## viq_all                 52.00  1.09    -0.51 7.71
## piq_all                 59.00  0.72    -1.23 9.11
## fsiq4_all               55.00  1.19    -0.36 8.24
## A_pct_severity           0.23  0.16    -1.25 0.03
## B_pct_severity           0.27 -0.03    -1.75 0.04
## ADI_social_total        14.00 -0.59    -1.26 2.09
## ADI_communication_total  8.00 -0.30    -1.67 1.28
## ADI_RRB_total            6.00 -0.45    -1.25 0.85
## ados_2_SA_CSS            7.00 -0.15    -1.85 1.12
## ados_2_RRB_CSS           6.00 -0.29    -1.96 1.12
## SRS_tscore              31.00 -0.41    -1.76 5.48
## SRS_tscore_self          5.00  0.00    -2.75 2.50
## RBS_total               25.00  0.47    -1.64 4.48
## SSP_total               17.00  0.34    -2.33 5.24
## vabsdscoresc_dss        31.00  1.16    -0.38 4.65
## vabsdscoresd_dss        19.00  0.87    -0.82 2.74
## vabsdscoress_dss        28.00 -0.11    -1.39 3.89
## vabsabcabc_standard     42.00 -1.09    -0.55 6.35
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 52    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 52    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 52  16.67  6.17  16.20   16.35  6.78  7.12  30.15
## meanFD                     4 52   0.22  0.15   0.18    0.19  0.11  0.06   0.76
## viq_all                    5 51 101.96 15.54 104.00  102.52 19.27 70.00 133.00
## piq_all                    6 51 104.75 17.53 106.00  105.33 20.76 66.42 134.00
## fsiq4_all                  7 51 103.61 16.05 106.00  104.04 17.79 69.00 130.00
## A_pct_severity             8 52   0.26  0.12   0.24    0.25  0.12  0.04   0.65
## B_pct_severity             9 52   0.25  0.13   0.24    0.24  0.12  0.00   0.67
## ADI_social_total          10 52  14.79  6.47  15.50   14.98  6.67  1.00  27.00
## ADI_communication_total   11 52  11.71  5.74  11.00   11.64  5.93  0.00  24.00
## ADI_RRB_total             12 52   4.12  2.41   4.00    4.10  2.97  0.00   9.00
## ados_2_SA_CSS             13 51   5.43  2.52   6.00    5.46  2.97  1.00  10.00
## ados_2_RRB_CSS            14 51   4.92  2.46   5.00    4.93  1.48  1.00   9.00
## SRS_tscore                15 47  66.21 11.50  67.00   66.05 13.34 43.00  90.00
## SRS_tscore_self           16 25  60.76  7.79  61.00   60.76  7.41 46.00  79.00
## RBS_total                 17 44  14.09 11.58  11.50   12.83 11.12  0.00  52.00
## SSP_total                 18 30 140.70 27.79 143.00  143.42 34.10 69.00 177.00
## vabsdscoresc_dss          19 48  83.06 15.26  81.00   82.60 14.83 50.00 122.00
## vabsdscoresd_dss          20 48  78.96 15.44  77.50   78.22 12.60 38.00 119.00
## vabsdscoress_dss          21 48  77.62 14.63  79.00   78.28 14.83 30.00 101.00
## vabsabcabc_standard       22 48  78.96 12.86  77.50   78.47  9.64 48.00 117.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.03  0.35    -0.87 0.86
## meanFD                    0.70  1.61     2.77 0.02
## viq_all                  63.00 -0.20    -0.87 2.18
## piq_all                  67.58 -0.24    -0.83 2.45
## fsiq4_all                61.00 -0.21    -0.95 2.25
## A_pct_severity            0.61  0.54     0.61 0.02
## B_pct_severity            0.67  0.68     0.69 0.02
## ADI_social_total         26.00 -0.23    -0.84 0.90
## ADI_communication_total  24.00  0.10    -0.71 0.80
## ADI_RRB_total             9.00  0.07    -0.71 0.33
## ados_2_SA_CSS             9.00 -0.11    -1.02 0.35
## ados_2_RRB_CSS            8.00 -0.44    -0.74 0.35
## SRS_tscore               47.00  0.10    -0.87 1.68
## SRS_tscore_self          33.00  0.06    -0.31 1.56
## RBS_total                52.00  1.21     1.67 1.75
## SSP_total               108.00 -0.72    -0.41 5.07
## vabsdscoresc_dss         72.00  0.39    -0.16 2.20
## vabsdscoresd_dss         81.00  0.36     0.46 2.23
## vabsdscoress_dss         71.00 -0.54     0.69 2.11
## vabsabcabc_standard      69.00  0.47     0.39 1.86
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 75    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 75    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 75  16.35  5.14  16.07   16.18  5.91  7.48  29.23
## meanFD                     4 75   0.27  0.31   0.16    0.20  0.11  0.04   1.60
## viq_all                    5 72  96.69 19.94  99.00   97.83 21.68 50.91 130.00
## piq_all                    6 74  98.46 21.74 103.50  100.10 21.50 44.03 138.00
## fsiq4_all                  7 73  98.22 19.99 103.00   99.19 19.81 59.00 139.00
## A_pct_severity             8 75   0.45  0.15   0.45    0.45  0.18  0.16   0.75
## B_pct_severity             9 75   0.20  0.12   0.19    0.19  0.13  0.00   0.47
## ADI_social_total          10 75  18.08  5.79  19.00   18.28  5.93  4.00  29.00
## ADI_communication_total   11 75  14.64  4.80  15.00   14.82  4.45  3.00  24.00
## ADI_RRB_total             12 75   3.77  2.40   3.00    3.61  1.48  0.00  10.00
## ados_2_SA_CSS             13 71   6.27  2.75   6.00    6.39  2.97  1.00  10.00
## ados_2_RRB_CSS            14 71   4.63  2.78   5.00    4.53  2.97  1.00  10.00
## SRS_tscore                15 66  73.64 11.65  76.00   74.24 12.60 48.00  90.00
## SRS_tscore_self           16 36  63.03  9.84  61.50   62.83  6.67 40.00  84.00
## RBS_total                 17 66  17.89 14.88  13.00   16.17 11.12  0.00  73.00
## SSP_total                 18 50 138.68 25.20 138.50  139.03 28.17 91.00 184.00
## vabsdscoresc_dss          19 67  73.79 14.70  75.00   74.42 10.38 21.00 110.00
## vabsdscoresd_dss          20 66  71.53 16.54  68.50   71.06 15.57 42.00 118.00
## vabsdscoress_dss          21 67  65.99 16.35  68.00   66.33 14.83 23.00 112.00
## vabsabcabc_standard       22 66  68.62 13.77  69.00   68.96 11.86 28.00 107.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     21.75  0.32    -0.60 0.59
## meanFD                   1.55  2.99     9.05 0.04
## viq_all                 79.09 -0.46    -0.64 2.35
## piq_all                 93.97 -0.61    -0.51 2.53
## fsiq4_all               80.00 -0.42    -0.82 2.34
## A_pct_severity           0.59  0.05    -1.00 0.02
## B_pct_severity           0.47  0.35    -0.77 0.01
## ADI_social_total        25.00 -0.34    -0.62 0.67
## ADI_communication_total 21.00 -0.32    -0.69 0.55
## ADI_RRB_total           10.00  0.58    -0.41 0.28
## ados_2_SA_CSS            9.00 -0.22    -1.06 0.33
## ados_2_RRB_CSS           9.00 -0.15    -1.23 0.33
## SRS_tscore              42.00 -0.34    -0.95 1.43
## SRS_tscore_self         44.00  0.21    -0.04 1.64
## RBS_total               73.00  1.25     1.44 1.83
## SSP_total               93.00 -0.09    -0.98 3.56
## vabsdscoresc_dss        89.00 -0.76     2.30 1.80
## vabsdscoresd_dss        76.00  0.41    -0.15 2.04
## vabsdscoress_dss        89.00 -0.16     0.40 2.00
## vabsabcabc_standard     79.00 -0.29     1.12 1.69
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Replication
##                         vars   n   mean    sd median trimmed   mad   min    max
## dataset*                   1 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 122  16.86  6.07  16.34   16.58  7.59  6.89  29.72
## meanFD                     4 122   0.23  0.46   0.14    0.15  0.07  0.04   4.60
## viq_all                    5 122 104.02 17.58 108.18  105.64 12.13 45.00 140.00
## piq_all                    6 122 104.64 18.41 108.96  106.56 14.08 49.00 139.00
## fsiq4_all                  7 122 104.94 17.14 108.09  107.29 11.86 50.00 134.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## SRS_tscore                15  65  47.23  9.34  44.00   45.66  4.45 37.00  90.00
## SRS_tscore_self           16  61  48.44  6.84  47.00   47.63  5.93 39.00  69.00
## RBS_total                 17  63   3.08 11.54   0.00    0.86  0.00  0.00  89.00
## SSP_total                 18  54 174.93 19.38 182.00  178.41  6.67 75.00 190.00
## vabsdscoresc_dss          19  39  92.74 25.45  96.00   95.82 20.76 21.00 125.00
## vabsdscoresd_dss          20  39  91.10 22.65  97.00   93.91 14.83 27.00 122.00
## vabsdscoress_dss          21  39  98.90 27.04 103.00  102.12 17.79 20.00 132.00
## vabsabcabc_standard       22  38  93.00 25.39 100.00   96.44 15.57 20.00 126.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      22.83  0.33    -0.97 0.55
## meanFD                    4.56  7.54    64.83 0.04
## viq_all                  95.00 -0.99     1.51 1.59
## piq_all                  90.00 -0.93     0.67 1.67
## fsiq4_all                84.00 -1.28     1.67 1.55
## A_pct_severity            -Inf    NA       NA   NA
## B_pct_severity            -Inf    NA       NA   NA
## ADI_social_total          -Inf    NA       NA   NA
## ADI_communication_total   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf    NA       NA   NA
## SRS_tscore               53.00  2.14     5.82 1.16
## SRS_tscore_self          30.00  1.05     0.48 0.88
## RBS_total                89.00  6.60    45.89 1.45
## SSP_total               115.00 -2.88    10.92 2.64
## vabsdscoresc_dss        104.00 -1.21     1.00 4.08
## vabsdscoresd_dss         95.00 -1.34     1.26 3.63
## vabsdscoress_dss        112.00 -1.26     1.10 4.33
## vabsabcabc_standard     106.00 -1.42     1.40 4.12
#------------------------------------------------------------------------------
# Table of subtypes X sex
# For each dataset, cross-tabulate subgroup (rows) by sex (columns) and run a
# Pearson chi-squared test on the same two columns.
# NOTE(review): the RRB_over_SC cells are small (5/8 and 1/5 in the tables
# below), so the chi-squared approximation may be unreliable for these tables;
# consider fisher.test() as a sensitivity check.

# Discovery
# keep only the rows of data2write that belong to the Discovery dataset
data2use = subset(data2write,data2write$dataset=="Discovery")
table(data2use$subgrp,data2use$sex)
##               
##                Female Male
##   RRB_over_SC       5    8
##   SC_equal_RRB     12   36
##   SC_over_RRB      18   54
##   TD               41   80
# cs_res is overwritten by the Replication test further down
cs_res = chisq.test(data2use$subgrp,data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 2.7445, df = 3, p-value = 0.4327
# Replication
# same procedure on the Replication rows; data2use is reused
data2use = subset(data2write,data2write$dataset=="Replication")
table(data2use$subgrp,data2use$sex)
##               
##                Female Male
##   RRB_over_SC       1    5
##   SC_equal_RRB     13   39
##   SC_over_RRB      22   53
##   TD               47   75
cs_res = chisq.test(data2use$subgrp,data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.3766, df = 3, p-value = 0.2236
#------------------------------------------------------------------------------
# Dependent variables to compare between subgroups. One set of mixed models is
# fit per variable; the names are also used as the row names of output_res.
vars2analyze = c("age","meanFD","viq_all","piq_all","fsiq4_all",
                 "A_pct_severity","B_pct_severity",
                 "ADI_social_total","ADI_communication_total","ADI_RRB_total",
                 "ados_2_SA_CSS","ados_2_RRB_CSS",
                 "SRS_tscore_self","RBS_total","SSP_total",
                 "vabsdscoress_dss","vabsdscoresd_dss","vabsdscoresc_dss","vabsabcabc_standard")

# Y-axis labels for the plots, matched to vars2analyze by position.
# NOTE(review): "ADI-R RRB" is used for both B_pct_severity and ADI_RRB_total,
# so those two plots carry the same axis label -- confirm this is intended.
vnames = c("Age","Mean FD","VIQ","PIQ","FIQ","ADI-R SC","ADI-R RRB","ADI-R Social","ADI-R Communication","ADI-R RRB","ADOS SA CSS","ADOS RRB CSS","SRS","RBS","SSP","Vineland Socialization","Vineland Daily Living Skills","Vineland Communication","Vineland ABC")

# Columns of the result table:
#   All_*                      : test of subgrp using every subgroup in df2use
#   SCequalRRB_vs_SCoverRRB_*  : SC_equal_RRB vs SC_over_RRB only
#   *.repBF                    : replication Bayes factor for that contrast
cnames = c("All_Disc.fstat","All_Disc.pval","All_Rep.fstat","All_Rep.pval",
           "SCequalRRB_vs_SCoverRRB_Disc.fstat","SCequalRRB_vs_SCoverRRB_Disc.tstat",
           "SCequalRRB_vs_SCoverRRB_Disc.pval","SCequalRRB_vs_SCoverRRB_Disc.es",
           "SCequalRRB_vs_SCoverRRB_Rep.fstat","SCequalRRB_vs_SCoverRRB_Rep.tstat",
           "SCequalRRB_vs_SCoverRRB_Rep.pval","SCequalRRB_vs_SCoverRRB_Rep.es",
           "SCequalRRB_vs_SCoverRRB.repBF")

# Pre-allocate an all-NA result table (one row per variable)
output_res = data.frame(matrix(nrow = length(vars2analyze),ncol = length(cnames)))
colnames(output_res) = cnames
rownames(output_res) = vars2analyze
output_res$varNames = vars2analyze

# Loop-invariant pieces: the random-effects formula (random intercept per
# Centre) and the plotting palette do not depend on the variable analyzed.
rx_form = as.formula(sprintf("~ 1|%s","Centre"))
colors2use = get_ggColorHue(3)

# seq_along() instead of 1:length() so an empty vars2analyze runs zero times
for (ivar in seq_along(vars2analyze)){

  y_var = vars2analyze[ivar]
  # print(y_var)

  # Fixed-effects formula: subgrp is the only fixed effect. The same formula is
  # used for all four models fit in this iteration.
  fx_form = as.formula(sprintf("%s ~ %s",y_var,"subgrp"))

  #----------------------------------------------------------------------------
  # Discovery, all subgroups
  df4mod = subset(df2use, df2use$dataset=="Discovery")

  # mixed-effect model: Centre as random intercept, subgrp as fixed effect;
  # rows with missing values are dropped (na.action = na.omit)
  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"All_Disc.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"All_Disc.pval"] = res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Replication, all subgroups
  df4mod = subset(df2use, df2use$dataset=="Replication")

  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"All_Rep.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"All_Rep.pval"] = res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Discovery, SC_equal_RRB vs SC_over_RRB (TD and RRB_over_SC removed)
  df4mod = subset(df2use, df2use$dataset=="Discovery" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  # group sizes in the Discovery set, passed to the replication Bayes factor
  n1 = sum(df4mod$subgrp=="SC_equal_RRB")
  m1 = sum(df4mod$subgrp=="SC_over_RRB")

  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.pval"] = res["subgrp","p-value"]
  res = summary(mod2use)
  # row 2 of the fixed-effects table is taken as the subgrp contrast
  # (row 1 being the intercept) -- assumes exactly two subgroups remain
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"] = res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.es"] = cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication, SC_equal_RRB vs SC_over_RRB (TD and RRB_over_SC removed)
  df4mod = subset(df2use, df2use$dataset=="Replication" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  # group sizes in the Replication set, passed to the replication Bayes factor
  n2 = sum(df4mod$subgrp=="SC_equal_RRB")
  m2 = sum(df4mod$subgrp=="SC_over_RRB")

  mod2use = eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res = anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.fstat"] = res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.pval"] = res["subgrp","p-value"]
  res = summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"] = res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.es"] = cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication Bayes Factor
  # tobs is the Discovery t-statistic and trep must be the Replication
  # t-statistic. Supplying the Discovery value for both would make the Bayes
  # factor ignore the Replication result entirely.
  res_bf = BFSALL(tobs = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"],
                  trep = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"],
                  n1 = n1,
                  n2 = n2,
                  m1 = m1,
                  m2 = m2,
                  sample = 2,
                  Type = 'ALL')
  output_res[y_var,"SCequalRRB_vs_SCoverRRB.repBF"] = res_bf["Replication BF","Replication 1"]

  # make a plot: jittered points plus boxplot per subgroup, one facet per
  # dataset; RRB_over_SC is left out of the figure
  df4plot = subset(df2use, df2use$subgrp!="RRB_over_SC")
  p = ggplot(data = df4plot, aes_string(x = "subgrp", y = y_var, colour = "subgrp")) + facet_grid(. ~ dataset)
  p = p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
  p = p + ylab(vnames[ivar]) + xlab("Group") +
    scale_x_discrete(labels=c("SC_equal_RRB"="SC=RRB","SC_over_RRB"="SC>RRB","TD"="TD")) +
                       scale_colour_manual(values = c(colors2use[2:3],"grey60")) + guides(colour=FALSE) +
    theme(text = element_text(size=fontSize-5),
        axis.text.x = element_text(size=fontSize-5),
        axis.text.y = element_text(size=fontSize-5))
  print(p)

}

# Copy the SC_equal_RRB vs SC_over_RRB effect sizes (the *.es columns of
# output_res) into the "0.6" row of the summary tables vabc and vabc_dls.
# vabc takes the vabsabcabc_standard row (plotted as "Vineland ABC"), and
# vabc_dls takes the vabsdscoresd_dss row (plotted as "Vineland Daily Living
# Skills").
# NOTE(review): vabc / vabc_dls are created outside this section; the "0.6"
# row label presumably is the z-threshold of this section -- confirm.
vabc["0.6","Discovery"] = output_res["vabsabcabc_standard","SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc["0.6","Replication"] = output_res["vabsabcabc_standard","SCequalRRB_vs_SCoverRRB_Rep.es"]
vabc_dls["0.6","Discovery"] = output_res["vabsdscoresd_dss","SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc_dls["0.6","Replication"] = output_res["vabsdscoresd_dss","SCequalRRB_vs_SCoverRRB_Rep.es"]
# print the full result table
output_res
##                         All_Disc.fstat All_Disc.pval All_Rep.fstat All_Rep.pval
## age                          0.2764449  8.423702e-01     0.5333990 6.597850e-01
## meanFD                       2.8411213  3.849901e-02     0.2732541 8.446636e-01
## viq_all                      2.0606131  1.060655e-01     1.9418668 1.234238e-01
## piq_all                      1.5443272  2.036586e-01     1.0637207 3.651305e-01
## fsiq4_all                    2.2139914  8.707432e-02     1.4197600 2.375609e-01
## A_pct_severity              31.1339556  9.934498e-12    44.9753913 1.776357e-15
## B_pct_severity              29.4477380  3.111467e-11     4.2452557 1.641808e-02
## ADI_social_total             1.2910958  2.785534e-01     6.5424344 1.975574e-03
## ADI_communication_total      0.4871784  6.155008e-01    10.4729563 6.165916e-05
## ADI_RRB_total               28.6351580  5.433998e-11     0.2482669 7.805294e-01
## ados_2_SA_CSS                2.5786048  7.994439e-02     2.3961729 9.533984e-02
## ados_2_RRB_CSS               0.7229015  4.873759e-01     0.2605255 7.710741e-01
## SRS_tscore_self             36.7695315  0.000000e+00    32.9427493 1.776357e-15
## RBS_total                   18.4871548  1.991074e-10    13.3371250 7.360727e-08
## SSP_total                   30.4728775  5.551115e-15    22.8487987 6.286305e-12
## vabsdscoress_dss            23.7174717  1.278755e-12    26.2420515 9.459100e-14
## vabsdscoresd_dss            12.2686164  3.219601e-07    10.2000437 3.685760e-06
## vabsdscoresc_dss             8.9789554  1.655782e-05     8.8683692 1.864441e-05
## vabsabcabc_standard         17.6024053  7.799775e-10    17.7427118 6.433748e-10
##                         SCequalRRB_vs_SCoverRRB_Disc.fstat
## age                                           1.208723e-04
## meanFD                                        1.613557e+00
## viq_all                                       1.573745e-02
## piq_all                                       3.467867e-02
## fsiq4_all                                     8.884585e-04
## A_pct_severity                                2.623303e+01
## B_pct_severity                                4.267055e+01
## ADI_social_total                              4.106023e-01
## ADI_communication_total                       4.859287e-01
## ADI_RRB_total                                 3.085261e+01
## ados_2_SA_CSS                                 4.391497e-01
## ados_2_RRB_CSS                                6.978527e-01
## SRS_tscore_self                               1.357964e+00
## RBS_total                                     1.031475e-01
## SSP_total                                     3.557144e-01
## vabsdscoress_dss                              4.187447e+00
## vabsdscoresd_dss                              3.343510e-01
## vabsdscoresc_dss                              3.553608e-03
## vabsabcabc_standard                           5.307453e-01
##                         SCequalRRB_vs_SCoverRRB_Disc.tstat
## age                                             0.01099419
## meanFD                                         -1.27025873
## viq_all                                         0.12544898
## piq_all                                        -0.18622209
## fsiq4_all                                      -0.02980702
## A_pct_severity                                  5.12181909
## B_pct_severity                                 -6.53227014
## ADI_social_total                                0.64078259
## ADI_communication_total                         0.69708587
## ADI_RRB_total                                  -5.55451285
## ados_2_SA_CSS                                   0.66268368
## ados_2_RRB_CSS                                 -0.83537579
## SRS_tscore_self                                 1.16531709
## RBS_total                                       0.32116585
## SSP_total                                       0.59641799
## vabsdscoress_dss                               -2.04632533
## vabsdscoresd_dss                               -0.57823091
## vabsdscoresc_dss                               -0.05961215
## vabsabcabc_standard                            -0.72852266
##                         SCequalRRB_vs_SCoverRRB_Disc.pval
## age                                          9.912471e-01
## meanFD                                       2.065565e-01
## viq_all                                      9.003910e-01
## piq_all                                      8.526045e-01
## fsiq4_all                                    9.762726e-01
## A_pct_severity                               1.230234e-06
## B_pct_severity                               1.838366e-09
## ADI_social_total                             5.229388e-01
## ADI_communication_total                      4.871559e-01
## ADI_RRB_total                                1.820225e-07
## ados_2_SA_CSS                                5.088946e-01
## ados_2_RRB_CSS                               4.052847e-01
## SRS_tscore_self                              2.495322e-01
## RBS_total                                    7.487757e-01
## SSP_total                                    5.528764e-01
## vabsdscoress_dss                             4.313077e-02
## vabsdscoresd_dss                             5.643120e-01
## vabsdscoresc_dss                             9.525737e-01
## vabsabcabc_standard                          4.678705e-01
##                         SCequalRRB_vs_SCoverRRB_Disc.es
## age                                       -0.0020486507
## meanFD                                     0.2366987382
## viq_all                                   -0.0531943418
## piq_all                                    0.0370911813
## fsiq4_all                                  0.0034257024
## A_pct_severity                            -0.7647298789
## B_pct_severity                             1.1922722865
## ADI_social_total                          -0.0261197588
## ADI_communication_total                    0.0354493213
## ADI_RRB_total                              1.0690358682
## ados_2_SA_CSS                             -0.0951001837
## ados_2_RRB_CSS                             0.1585085466
## SRS_tscore_self                           -0.0638805495
## RBS_total                                 -0.0013559914
## SSP_total                                 -0.1555410869
## vabsdscoress_dss                           0.2696617895
## vabsdscoresd_dss                           0.0523634891
## vabsdscoresc_dss                           0.0000932107
## vabsabcabc_standard                        0.1137339593
##                         SCequalRRB_vs_SCoverRRB_Rep.fstat
## age                                             0.1025825
## meanFD                                          1.2269701
## viq_all                                         1.7164429
## piq_all                                         1.8155303
## fsiq4_all                                       1.5264451
## A_pct_severity                                 71.9443659
## B_pct_severity                                  4.1152326
## ADI_social_total                               11.2111628
## ADI_communication_total                        12.7860799
## ADI_RRB_total                                   0.1987732
## ados_2_SA_CSS                                   3.0812381
## ados_2_RRB_CSS                                  0.4605097
## SRS_tscore_self                                 0.9242800
## RBS_total                                       3.1219435
## SSP_total                                       0.1438166
## vabsdscoress_dss                               20.5316013
## vabsdscoresd_dss                                8.7578258
## vabsdscoresc_dss                               10.7802375
## vabsabcabc_standard                            22.1882695
##                         SCequalRRB_vs_SCoverRRB_Rep.tstat
## age                                            -0.3202850
## meanFD                                          1.1076868
## viq_all                                        -1.3101309
## piq_all                                        -1.3474162
## fsiq4_all                                      -1.2354939
## A_pct_severity                                  8.4820025
## B_pct_severity                                 -2.0286036
## ADI_social_total                                3.3483075
## ADI_communication_total                         3.5757628
## ADI_RRB_total                                  -0.4458398
## ados_2_SA_CSS                                   1.7553456
## ados_2_RRB_CSS                                 -0.6786086
## SRS_tscore_self                                 0.9613948
## RBS_total                                       1.7669022
## SSP_total                                      -0.3792315
## vabsdscoress_dss                               -4.5311810
## vabsdscoresd_dss                               -2.9593624
## vabsdscoresc_dss                               -3.2833272
## vabsabcabc_standard                            -4.7104426
##                         SCequalRRB_vs_SCoverRRB_Rep.pval
## age                                         7.493002e-01
## meanFD                                      2.701762e-01
## viq_all                                     1.926962e-01
## piq_all                                     1.803849e-01
## fsiq4_all                                   2.190803e-01
## A_pct_severity                              6.128431e-14
## B_pct_severity                              4.467566e-02
## ADI_social_total                            1.081815e-03
## ADI_communication_total                     5.014772e-04
## ADI_RRB_total                               6.565032e-01
## ados_2_SA_CSS                               8.181752e-02
## ados_2_RRB_CSS                              4.987259e-01
## SRS_tscore_self                             3.404866e-01
## RBS_total                                   8.015024e-02
## SSP_total                                   7.055875e-01
## vabsdscoress_dss                            1.497522e-05
## vabsdscoresd_dss                            3.782327e-03
## vabsdscoresc_dss                            1.375857e-03
## vabsabcabc_standard                         7.329985e-06
##                         SCequalRRB_vs_SCoverRRB_Rep.es
## age                                         0.05779712
## meanFD                                     -0.19988792
## viq_all                                     0.28830307
## piq_all                                     0.31291930
## fsiq4_all                                   0.29154308
## A_pct_severity                             -1.39474985
## B_pct_severity                              0.42698121
## ADI_social_total                           -0.54164153
## ADI_communication_total                    -0.56267039
## ADI_RRB_total                               0.14227907
## ados_2_SA_CSS                              -0.31500221
## ados_2_RRB_CSS                              0.10825686
## SRS_tscore_self                            -0.25035851
## RBS_total                                  -0.27897124
## SSP_total                                   0.07684736
## vabsdscoress_dss                            0.74280170
## vabsdscoresd_dss                            0.46132860
## vabsdscoresc_dss                            0.62106305
## vabsabcabc_standard                         0.77122945
##                         SCequalRRB_vs_SCoverRRB.repBF                varNames
## age                                      6.960942e-01                     age
## meanFD                                   1.553721e+00                  meanFD
## viq_all                                  7.004006e-01                 viq_all
## piq_all                                  7.079673e-01                 piq_all
## fsiq4_all                                6.959276e-01               fsiq4_all
## A_pct_severity                           1.074064e+05          A_pct_severity
## B_pct_severity                           6.884517e+07          B_pct_severity
## ADI_social_total                         8.545985e-01        ADI_social_total
## ADI_communication_total                  8.878127e-01 ADI_communication_total
## ADI_RRB_total                            7.048704e+05           ADI_RRB_total
## ados_2_SA_CSS                            8.666988e-01           ados_2_SA_CSS
## ados_2_RRB_CSS                           9.852443e-01          ados_2_RRB_CSS
## SRS_tscore_self                          1.370332e+00         SRS_tscore_self
## RBS_total                                7.325785e-01               RBS_total
## SSP_total                                8.321138e-01               SSP_total
## vabsdscoress_dss                         5.491854e+00        vabsdscoress_dss
## vabsdscoresd_dss                         8.226833e-01        vabsdscoresd_dss
## vabsdscoresc_dss                         6.967464e-01        vabsdscoresc_dss
## vabsabcabc_standard                      9.064006e-01     vabsabcabc_standard

SC-RRB difference z = 0.7

#------------------------------------------------------------------------------
# Subtype both NDAR datasets with a z-score cutoff of 0.7 on the difference
# score (first minus second column named in vars2use)
z_thresh <- 0.7

# vars2use = c("dbaes_atotal","dbaes_btotal")

# Discovery: normalise the difference score by the Discovery mean and SD
ds_disc <- Dverbal_Discovery[[vars2use[1]]] - Dverbal_Discovery[[vars2use[2]]]
mean2use <- mean(ds_disc)
sd2use <- sd(ds_disc)
Dverbal_Discovery <- make_subtype(Dverbal_Discovery, z_thresh, mean2use, sd2use)

# Replication: normalise by the Replication set's own mean and SD
# (mean2use / sd2use are overwritten with the Replication values)
ds_rep <- Dverbal_Replication[[vars2use[1]]] - Dverbal_Replication[[vars2use[2]]]
mean2use <- mean(ds_rep)
sd2use <- sd(ds_rep)
Dverbal_Replication <- make_subtype(Dverbal_Replication, z_thresh, mean2use, sd2use)


#------------------------------------------------------------------------------
# Table of subtypes X sex
# For each NDAR dataset, cross-tabulate the z_ds_group subtype (rows) by sex
# (columns) and run a Pearson chi-squared test on the same two columns.

# Discovery
table(Dverbal_Discovery$z_ds_group,Dverbal_Discovery$sex)
##               
##                  F   M
##   RRB_over_SC   42 169
##   SC_equal_RRB 112 365
##   SC_over_RRB   43 158
# cs_res is overwritten by the Replication test further down
cs_res = chisq.test(Dverbal_Discovery$z_ds_group,Dverbal_Discovery$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Discovery$z_ds_group and Dverbal_Discovery$sex
## X-squared = 1.1723, df = 2, p-value = 0.5565
# Replication
table(Dverbal_Replication$z_ds_group,Dverbal_Replication$sex)
##               
##                  F   M
##   RRB_over_SC   42 170
##   SC_equal_RRB 111 374
##   SC_over_RRB   43 150
cs_res = chisq.test(Dverbal_Replication$z_ds_group,Dverbal_Replication$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Replication$z_ds_group and Dverbal_Replication$sex
## X-squared = 0.82192, df = 2, p-value = 0.663
#------------------------------------------------------------------------------
# Descriptive stats

# Discovery
# columns summarised per subtype (age is derived below)
cols2use <- c("z_ds_group", "sex", "age", "ados_age", "ados_sa_css",
              "ados_rrb_css", "iq", "dbaes_atotal", "dbaes_btotal")

# pull the identifier, demographic, ADOS, IQ and dbaes columns
df2use <- Dverbal_Discovery[, c("subjectkey", "dataset_id", "collection_id",
                                "interview_age", "sex", "z_ds_group",
                                "ados_age", "ados_sa_css", "ados_rrb_css",
                                "iq", "dbaes_atotal", "dbaes_btotal")]
# age is interview_age divided by 12
df2use[["age"]] <- df2use[["interview_age"]] / 12

# descriptive statistics split by the z_ds_group subtype
res <- describeBy(x = df2use[cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 211   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 211    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 211   9.96  4.14   9.50    9.61  3.34  2.00  27.17  25.17
## ados_age        4  27  93.59 38.82  86.00   92.04 38.55 37.00 171.00 134.00
## ados_sa_css     5  27   6.52  2.38   7.00    6.61  2.97  2.00  10.00   8.00
## ados_rrb_css    6  27   7.59  2.08   8.00    7.83  1.48  1.00  10.00   9.00
## iq              7  55 103.16 16.12 105.00  103.76 14.83 54.00 139.00  85.00
## dbaes_atotal    8 211   0.22  0.11   0.22    0.22  0.12  0.00   0.51   0.51
## dbaes_btotal    9 211   0.47  0.12   0.46    0.47  0.13  0.14   0.79   0.65
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           0.97     1.63 0.28
## ados_age      0.43    -0.99 7.47
## ados_sa_css  -0.35    -1.07 0.46
## ados_rrb_css -1.22     1.58 0.40
## iq           -0.47     0.57 2.17
## dbaes_atotal  0.15    -0.64 0.01
## dbaes_btotal  0.08    -0.41 0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 477   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 477    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 477   9.13  5.62   8.00    8.34  4.82   0  45.75  45.75  1.98
## ados_age        4  81  80.81 45.01  64.00   75.60 38.55  27 202.00 175.00  0.78
## ados_sa_css     5  81   6.77  2.08   7.00    6.85  1.48   1  10.00   9.00 -0.33
## ados_rrb_css    6  81   7.65  2.35   8.00    8.06  1.48   1  10.00   9.00 -1.55
## iq              7 119 104.25 18.51 107.00  105.64 17.79  42 138.00  96.00 -0.82
## dbaes_atotal    8 477   0.30  0.13   0.30    0.30  0.14   0   0.67   0.67  0.00
## dbaes_btotal    9 477   0.31  0.13   0.31    0.31  0.14   0   0.68   0.68 -0.13
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              6.55 0.26
## ados_age        -0.54 5.00
## ados_sa_css     -0.30 0.23
## ados_rrb_css     2.12 0.26
## iq               0.95 1.70
## dbaes_atotal    -0.34 0.01
## dbaes_btotal    -0.26 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 201   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 201    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 201   7.31  5.08   5.75    6.41  2.97  1.67  37.33  35.67
## ados_age        4  45  69.80 35.62  65.00   63.81 25.20 30.00 172.00 142.00
## ados_sa_css     5  45   7.51  1.55   7.00    7.54  1.48  4.00  10.00   6.00
## ados_rrb_css    6  45   8.18  1.70   8.00    8.35  1.48  1.00  10.00   9.00
## iq              7  25 104.00 19.93 111.00  105.29 13.34 40.00 140.00 100.00
## dbaes_atotal    8 201   0.48  0.13   0.47    0.47  0.14  0.16   0.87   0.71
## dbaes_btotal    9 201   0.22  0.10   0.22    0.22  0.10  0.00   0.57   0.57
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.44     8.14 0.36
## ados_age      1.54     1.89 5.31
## ados_sa_css  -0.04    -0.77 0.23
## ados_rrb_css -1.64     4.86 0.25
## iq           -1.08     2.07 3.99
## dbaes_atotal  0.21    -0.21 0.01
## dbaes_btotal  0.30    -0.02 0.01
# Replication set: descriptive statistics computed separately per subtype
df2use <- Dverbal_Replication[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# rescale interview_age by 1/12 (presumably months -> years; confirm units)
df2use[["age"]] <- df2use[["interview_age"]] / 12

cols2use <- c(
  "z_ds_group", "sex", "age", "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)
res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 212   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 212    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 212   9.88  4.73   9.33    9.40  4.14  3.00  28.58  25.58
## ados_age        4  17  84.00 51.33  75.00   80.20 54.86 36.00 189.00 153.00
## ados_sa_css     5  17   6.53  2.21   7.00    6.53  2.97  3.00  10.00   7.00
## ados_rrb_css    6  17   7.18  1.51   7.00    7.13  1.48  5.00  10.00   5.00
## iq              7  63 103.48 18.70 105.00  104.24 14.83 57.00 152.00  95.00
## dbaes_atotal    8 212   0.23  0.11   0.22    0.22  0.11  0.01   0.61   0.60
## dbaes_btotal    9 212   0.49  0.13   0.48    0.48  0.11  0.21   0.93   0.72
##               skew kurtosis    se
## z_ds_group*    NaN      NaN  0.00
## sex*            NA       NA    NA
## age           1.18     2.07  0.32
## ados_age      0.64    -1.09 12.45
## ados_sa_css  -0.18    -1.35  0.54
## ados_rrb_css  0.33    -0.64  0.37
## iq           -0.30     0.52  2.36
## dbaes_atotal  0.63     0.74  0.01
## dbaes_btotal  0.48     0.77  0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 485   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 485    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 485   8.78  5.19   7.75    8.04  4.57   0  33.83  33.83  1.67
## ados_age        4  96  81.18 38.52  73.00   77.08 42.25  35 196.00 161.00  0.82
## ados_sa_css     5  96   6.85  2.03   7.00    6.90  1.48   2  10.00   8.00 -0.06
## ados_rrb_css    6  96   7.33  2.44   8.00    7.71  1.48   1  10.00   9.00 -1.32
## iq              7  92 106.80 15.98 108.00  106.42 14.08  64 146.00  82.00  0.11
## dbaes_atotal    8 485   0.31  0.14   0.31    0.31  0.13   0   0.78   0.78  0.06
## dbaes_btotal    9 485   0.32  0.14   0.32    0.32  0.14   0   0.81   0.81  0.11
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              3.90 0.24
## ados_age         0.04 3.93
## ados_sa_css     -0.73 0.21
## ados_rrb_css     1.14 0.25
## iq              -0.01 1.67
## dbaes_atotal    -0.03 0.01
## dbaes_btotal     0.09 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 193   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 193    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 193   8.11  6.31   6.33    6.83  3.71  2.08  40.92  38.83
## ados_age        4  40  71.15 28.16  68.00   67.28 23.72 30.00 141.00 111.00
## ados_sa_css     5  40   7.25  1.84   7.00    7.28  1.48  3.00  10.00   7.00
## ados_rrb_css    6  40   7.53  2.23   8.00    7.78  1.48  1.00  10.00   9.00
## iq              7  31 111.52 18.36 115.00  112.60 16.31 62.00 146.00  84.00
## dbaes_atotal    8 193   0.49  0.13   0.49    0.48  0.13  0.14   0.96   0.82
## dbaes_btotal    9 193   0.23  0.12   0.23    0.23  0.13  0.00   0.50   0.50
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.21     5.59 0.45
## ados_age      1.04     0.50 4.45
## ados_sa_css  -0.07    -0.88 0.29
## ados_rrb_css -1.11     1.25 0.35
## iq           -0.60    -0.04 3.30
## dbaes_atotal  0.28     0.34 0.01
## dbaes_btotal  0.02    -0.66 0.01
#------------------------------------------------------------------------------
# Tests of differences in interview age across the subtypes

# Discovery: linear model of interview_age on subtype, summarised as an
# ANOVA table
mod2use <- lm(
  formula = interview_age ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  111058   55529  14.339 7.433e-07 ***
## Residuals  886 3431008    3872                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: same interview_age ~ subtype model on the Replication set
mod2use <- lm(
  formula = interview_age ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value  Pr(>F)   
## z_ds_group   2   48109 24054.6  5.8372 0.00303 **
## Residuals  887 3655255  4120.9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in SC across the subtypes

# Discovery: linear model of dbaes_atotal on subtype, summarised as an
# ANOVA table
mod2use <- lm(
  formula = dbaes_atotal ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.1881  3.5940  213.96 < 2.2e-16 ***
## Residuals  886 14.8827  0.0168                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: same dbaes_atotal ~ subtype model on the Replication set
mod2use <- lm(
  formula = dbaes_atotal ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.294  3.6470  212.52 < 2.2e-16 ***
## Residuals  887 15.221  0.0172                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in RRB across the subtypes

# Discovery: linear model of dbaes_btotal on subtype, summarised as an
# ANOVA table
mod2use <- lm(
  formula = dbaes_btotal ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.6145  3.3073  213.73 < 2.2e-16 ***
## Residuals  886 13.7103  0.0155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: same dbaes_btotal ~ subtype model on the Replication set
mod2use <- lm(
  formula = dbaes_btotal ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.9404  3.4702  200.03 < 2.2e-16 ***
## Residuals  887 15.3882  0.0173                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in ADOS SA CSS across the subtypes

# Discovery: linear model of ados_sa_css on subtype, summarised as an
# ANOVA table
mod2use <- lm(
  formula = ados_sa_css ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## z_ds_group   2  21.94 10.9711  2.7587 0.06659 .
## Residuals  150 596.53  3.9769                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: same ados_sa_css ~ subtype model on the Replication set
mod2use <- lm(
  formula = ados_sa_css ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   7.37  3.6826  0.9211 0.4003
## Residuals  150 599.69  3.9980
#------------------------------------------------------------------------------
# Tests of differences in ADOS RRB CSS across the subtypes

# Discovery: linear model of ados_rrb_css on subtype, summarised as an
# ANOVA table
mod2use <- lm(
  formula = ados_rrb_css ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   9.30  4.6508  1.0268 0.3607
## Residuals  150 679.42  4.5294
# Replication: same ados_rrb_css ~ subtype model on the Replication set
mod2use <- lm(
  formula = ados_rrb_css ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   1.72  0.8622  0.1625 0.8502
## Residuals  150 795.78  5.3052
#------------------------------------------------------------------------------
# Tests of differences in IQ across the subtypes

# Discovery: linear model of iq on subtype, summarised as an ANOVA table
mod2use <- lm(
  formula = iq ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2     45   22.37  0.0685 0.9338
## Residuals  196  63984  326.45
# Replication: same iq ~ subtype model on the Replication set
mod2use <- lm(
  formula = iq ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   1364  682.15   2.268 0.1064
## Residuals  183  55042  300.78
#------------------------------------------------------------------------------
# Make scatterplots with difference score Z subtypes in different colors
maxScores <- c(3, 4)

# Discovery: SC (x) against RRB (y), one point per row, coloured by subtype
p_disc <- ggplot(
  data = Dverbal_Discovery,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Discovery")

# Legend-free copy for saving. guides(colour = FALSE) is deprecated in
# ggplot2 (>= 3.3.4); "none" is the supported way to drop a legend.
p1_top_left <- p_disc + guides(colour = "none")
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Disc_scatterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p1_top_left
)
p_disc

# subtype counts in the Discovery set
table(Dverbal_Discovery$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          211          477          201
# Correlation test between the SC and RRB totals in the Discovery set
# (cor.test defaults, as in the printed result)
cor_res <- cor.test(
  Dverbal_Discovery$dbaes_atotal,
  Dverbal_Discovery$dbaes_btotal
)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1554332 0.2806578
## sample estimates:
##       cor 
## 0.2189469
# Replication: SC (x) against RRB (y), one point per row, coloured by subtype
p_rep <- ggplot(
  data = Dverbal_Replication,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Replication")

# Legend-free copy for saving. guides(colour = FALSE) is deprecated in
# ggplot2 (>= 3.3.4); "none" is the supported way to drop a legend.
p2_bottom_left <- p_rep + guides(colour = "none")
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Rep_scatterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p2_bottom_left
)
p_rep

# subtype counts in the Replication set
table(Dverbal_Replication$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          212          485          193
# Correlation test between the SC and RRB totals in the Replication set
# (cor.test defaults, as in the printed result)
cor_res <- cor.test(
  Dverbal_Replication$dbaes_atotal,
  Dverbal_Replication$dbaes_btotal
)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1700824 0.2943934
## sample estimates:
##       cor 
## 0.2331903
#------------------------------------------------------------------------------
# Run supervised model with Discovery as training and Replication as Test

# run validation
# "Actual" labels: each set is subtyped with z-scores computed from its own
# difference-score mean and sd
train_data <- Dverbal_Discovery
test_data <- Dverbal_Replication

mean2use <- mean(train_data[, vars2use[1]] - train_data[, vars2use[2]])
sd2use <- sd(train_data[, vars2use[1]] - train_data[, vars2use[2]])
tmp_train <- make_subtype(
  data2use = train_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

mean2use <- mean(test_data[, vars2use[1]] - test_data[, vars2use[2]])
sd2use <- sd(test_data[, vars2use[1]] - test_data[, vars2use[2]])
tmp_test <- make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#==============================================================================
#==============================================================================
# "Predicted" labels: the test set is subtyped again, this time with the
# training-set mean and sd (mean2use / sd2use are reset to the train values)
mean2use <- mean(train_data[, vars2use[1]] - train_data[, vars2use[2]])
sd2use <- sd(train_data[, vars2use[1]] - train_data[, vars2use[2]])
train_mean <- mean2use
train_sd <- sd2use

pred_labels <- make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = train_mean,
  sd2use = train_sd
)
# rows = actual labels (test-set norms), columns = predicted labels
# (train-set norms)
confmat <- table(tmp_test$z_ds_group, pred_labels$z_ds_group)
# accuracy = share of test rows on the diagonal of the confusion matrix
acc <- (confmat[1, 1] + confmat[2, 2] + confmat[3, 3]) / nrow(pred_labels)

# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#==============================================================================
#==============================================================================

# plot confusion matrix
# The hook shrinks the drawing area to the top-right 90% of the page so the
# axis captions added below fit in the left and bottom margins.
setHook(
  "grid.newpage",
  function() {
    pushViewport(viewport(
      x = 1, y = 1, width = 0.9, height = 0.9,
      name = "vp", just = c("right", "top")
    ))
  },
  action = "prepend"
)
# Cell colour = percentage of each actual (row) class; cell text = raw counts.
# pheatmap expects `breaks` to be one element longer than `color`, hence 101
# breaks for 100 colours (the original passed 100 via the partially matched
# `length=` argument).
pheatmap(
  confmat / rowSums(confmat) * 100,
  display_numbers = confmat,
  color = colorRampPalette(c("white", "red"))(100),
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  fontsize_number = fontSize,
  fontsize_row = fontSize,
  fontsize_col = fontSize,
  labels_row = c("RRB>SC", "SC=RRB", "SC>RRB"),
  labels_col = c("RRB>SC", "SC=RRB", "SC>RRB"),
  angle_col = 90,
  breaks = seq(0, 100, length.out = 101)
)
setHook("grid.newpage", NULL, "replace")
# Axis captions must follow the table layout: columns (bottom caption) hold
# the predicted labels and rows (left caption) the actual labels. The two
# captions were previously swapped.
grid::grid.text("Predicted Labels", y = -0.07, gp = gpar(fontsize = fontSize))
grid::grid.text("Actual Labels", x = -0.07, rot = 90, gp = gpar(fontsize = fontSize))

# show accuracy
true_accuracy <- acc
true_accuracy
## [1] 0.9820225
#================================================================================
#================================================================================
# #------------------------------------------------------------------------------
# # Permute subtype labels to examine how well supervised model performs
#
# # nperm = 10000
#
# # make subtypes using z-scores computed from the mean and sd of the training set
# train_data = Dverbal_Discovery
# test_data = Dverbal_Replication
# mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# tmp_train = make_subtype(data2use = train_data,
#                          z_thresh = z_thresh,
#                          mean2use = mean2use,
#                          sd2use = sd2use)
#
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# tmp_test = make_subtype(data2use = test_data,
#                         z_thresh = z_thresh,
#                         mean2use = mean2use,
#                         sd2use = sd2use)
#
# acc = vector(length = nperm)
# for (iperm in 1:nperm){
#   # set seed for reproducibility
#   set.seed(iperm)
#
#   sc_perm = sample(train_data[,vars2use[1]])
#   rrb_perm = sample(train_data[,vars2use[2]])
#   perm_mean2use = mean(sc_perm - rrb_perm)
#   perm_sd2use = sd(sc_perm - rrb_perm)
#   # perm_mean2use = mean(train_data[,vars2use[1]] - rrb_perm)
#   # perm_sd2use = sd(train_data[,vars2use[1]] - rrb_perm)
#   pred_labels = make_subtype(data2use = tmp_test,
#                         z_thresh = z_thresh,
#                         mean2use = perm_mean2use,
#                         sd2use = perm_sd2use)
#   confmat = table(tmp_test$z_ds_group,pred_labels$z_ds_group)
#   acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/dim(pred_labels)[1]
#
#   # compute model
#   permuted_labels = sample(tmp_train$z_ds_group)
#   mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
#   pred_labels = predict(mod2use, tmp_test[,vars2use])
#   confmat = table(tmp_test$z_ds_group,pred_labels)
#   acc[iperm] = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#   #============================================================================
# } # for (iperm in 1:nperm)
#
# df2plot = data.frame(Accuracy = acc)
# p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
# p
#
# # compute p-value
# pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
# pval
#================================================================================
#================================================================================

#------------------------------------------------------------------------------
# Plot difference score Z subtypes in Discovery set
maxScores <- c(3, 4)

# Discovery - make plot with all individuals shown as lines
df2use <- Dverbal_Discovery[, c("subjectkey", adi_total_vars2use)]
# subtype labels come from tmp_train, which was built from the same rows
df2use$subgrp <- factor(tmp_train$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale (SC, RRB)
df4plot <- melt(
  df2use,
  id.vars = c("subjectkey", "subgrp"),
  measure.vars = c("SC", "RRB")
)

# one facet per subtype; each subject's SC and RRB points are joined by a line
p <- ggplot(
  data = df4plot,
  aes(x = variable, y = value, colour = subgrp, group = subjectkey)
) +
  facet_grid(. ~ subgrp)
# guides(color = FALSE) is deprecated in ggplot2 (>= 3.3.4); use "none".
# The legend is dropped once here, so the saved plot is simply `p`.
p <- p + geom_point(shape = 1) + geom_line(alpha = 0.2) + ylim(0, 1) +
  guides(colour = "none")
p <- p + ylab("Percent Severity") + xlab("ADI-R subscale")
p3_middle_top <- p
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Disc_jitterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p3_middle_top
)
p

# Plot difference score Z subtypes in Replication set
maxScores <- c(3, 4)

# Replication - make plot with all individuals shown as lines
df2use <- Dverbal_Replication[, c("subjectkey", adi_total_vars2use)]
# subtype labels come from tmp_test, which was built from the same rows
df2use$subgrp <- factor(tmp_test$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# long format: one row per subject x subscale (SC, RRB)
df4plot <- melt(
  df2use,
  id.vars = c("subjectkey", "subgrp"),
  measure.vars = c("SC", "RRB")
)

# one facet per subtype; each subject's SC and RRB points are joined by a line
p <- ggplot(
  data = df4plot,
  aes(x = variable, y = value, colour = subgrp, group = subjectkey)
) +
  facet_grid(. ~ subgrp)
# guides(color = FALSE) is deprecated in ggplot2 (>= 3.3.4); use "none".
# The legend is dropped once here, so the saved plot is simply `p`.
p <- p + geom_point(shape = 1) + geom_line(alpha = 0.2) + ylim(0, 1) +
  guides(colour = "none")
p <- p + ylab("Percent Severity") + xlab("ADI-R subscale")
p4_middle_bottom <- p
ggsave(
  filename = file.path(
    plotpath,
    sprintf("final_NDAR_Rep_jitterplot_z%s.pdf", as.character(z_thresh))
  ),
  plot = p4_middle_bottom
)
p

#------------------------------------------------------------------------------
# Apply NDAR subtypes to EU-AIMS LEAP data

Dverbal <- read.csv(file.path(datapath, "tidy_verbal.csv"))
euaims_data <- read.csv(file.path(datapath, "tidy_euaims.csv"))

# keep ASD rows that have all seven subscale percent-severity scores
mask1 <- euaims_data$Diagnosis == "ASD"
mask2 <- (is.na(euaims_data$A1_pct_severity) |
  is.na(euaims_data$A2_pct_severity) |
  is.na(euaims_data$A3_pct_severity) |
  is.na(euaims_data$B1_pct_severity) |
  is.na(euaims_data$B2_pct_severity) |
  is.na(euaims_data$B3_pct_severity) |
  is.na(euaims_data$B4_pct_severity))
euaims_data <- subset(euaims_data, (mask1 & !mask2))

# The NDAR SC / RRB columns in Dverbal are used as read from file (the former
# self-assignments of these two columns were no-ops and have been removed).
# For EU-AIMS, the SC and RRB totals are built as the mean of the A1-A3 and
# B1-B4 percent-severity subscales, stored under the same column names.
euaims_data[, vars2use[1]] <- (euaims_data$A1_pct_severity +
  euaims_data$A2_pct_severity +
  euaims_data$A3_pct_severity) / 3
euaims_data[, vars2use[2]] <- (euaims_data$B1_pct_severity +
  euaims_data$B2_pct_severity +
  euaims_data$B3_pct_severity +
  euaims_data$B4_pct_severity) / 4

train_data <- Dverbal
test_data <- euaims_data

# norms (mean and sd of the SC - RRB difference score) from the full NDAR set
mean2use <- mean(train_data[, vars2use[1]] - train_data[, vars2use[2]], na.rm = TRUE)
sd2use <- sd(train_data[, vars2use[1]] - train_data[, vars2use[2]], na.rm = TRUE)
c(mean2use, sd2use)
## [1] -0.01045243  0.19482749
# Subtype both sets with the same norms (mean2use / sd2use computed above):
# first the training data itself, then the test data
tmp_train <- make_subtype(
  data2use = train_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

tmp_test <- make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#===========================================================================
#===========================================================================
# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#
# tmp_test$svm_pred_labels = pred_labels
#
# # plot confusion matrix
# setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
#          breaks= seq(0,100, length=100))
# setHook("grid.newpage", NULL, "replace")
# grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=fontSize))
# grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))
#===========================================================================
#===========================================================================


# scatterplots of SC against RRB, coloured by subtype
# p1: training data (title "NDAR ALL")
p1 <- ggplot(
  data = tmp_train,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))
) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ylim(0, 0.8) +
  xlim(0, 0.8) +
  ggtitle("NDAR ALL")
p1

# p2: test data (EU-AIMS) with the subtype labels assigned above
p2 <- ggplot(
  data = tmp_test,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))
) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ylim(0, 0.8) +
  xlim(0, 0.8) +
  ggtitle("EU-AIMS with Groups from NDAR ALL")
p2

#===========================================================================
#===========================================================================
# p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS")
# p5_bottom_right = p3 + guides(colour=FALSE)
# ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
# p3
#===========================================================================
#===========================================================================

# Save the subtyped EU-AIMS LEAP table (tmp_test) as CSV in datapath; the
# z threshold is embedded in the file name
write.csv(
  tmp_test,
  file.path(
    datapath,
    sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv", as.character(z_thresh))
  )
)

#===========================================================================
#===========================================================================
# # Make final plot
# p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
# ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
# p_final
#===========================================================================
#===========================================================================

#------------------------------------------------------------------------------
# Integrate subtypes with rest of EU-AIMS LEAP data
euaims_data <- read.csv(file.path(datapath, "tidy_euaims.csv"))

# TD rows. Columns are picked by position (2:6); for the rbind() below to
# succeed they must carry the same names as the first five entries of cols2use.
td_df <- subset(euaims_data, euaims_data$Diagnosis == "TD", select = 2:6)
td_df$A_pct_severity <- as.numeric(NA)
td_df$B_pct_severity <- as.numeric(NA)
td_df$subgrp <- "TD"

#===========================================================================
#===========================================================================
# cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
cols2use <- c(
  "subid", "age", "Centre", "Schedule", "Diagnosis",
  "dbaes_atotal", "dbaes_btotal", "z_ds_group"
)
#===========================================================================
#===========================================================================
# ASD rows: rename the last three columns to match td_df
tmp_asd <- tmp_test[, cols2use]
colnames(tmp_asd)[6] <- "A_pct_severity"
colnames(tmp_asd)[7] <- "B_pct_severity"
colnames(tmp_asd)[8] <- "subgrp"
asd_df <- tmp_asd

# NOTE(review): absolute, machine-specific path; the rest of the script
# resolves files through here()/datapath. Confirm where this file should live.
fname <- "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx"
pp_data <- read_excel(fname)
# keep only rows whose notes column equals "ok"
mask <- pp_data$notes == "ok"
pp_data <- subset(pp_data, mask)

# attach sex from pp_data; merge() keeps only subids present in both tables
asd_df <- merge(pp_data[, c("subid", "sex")], asd_df, by = "subid")
td_df <- merge(pp_data[, c("subid", "sex")], td_df, by = "subid")

# Stack TD and ASD once, after the merges. An earlier rbind() of the
# pre-merge tables was overwritten here without being used, so it is dropped.
all_data <- rbind(td_df, asd_df)

# pp_data and all_data share some column names, so merge() suffixes them
# .x (pp_data) / .y (all_data); age is taken from pp_data, sex from all_data
data2write <- merge(pp_data, all_data, by = "subid")
data2write$age <- data2write$age.x
data2write$sex <- data2write$sex.y
print(table(data2write$Schedule, data2write$subgrp))
##    
##     RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   A           7           36          37 78
##   B           1           45          39 83
##   C           4           27          32 59
##   D           1           10          27 23
# cross-tabulate Centre (rows) by subgroup (columns)
print(table(data2write[["Centre"]], data2write[["subgrp"]]))
##                
##                 RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   CAMBRIDGE               4           27          11 29
##   KINGS_COLLEGE           7           49          50 78
##   MANNHEIM                0            0           0 34
##   NIJMEGEN                2           35          56 64
##   UTRECHT                 0            7          18 38
# cross-tabulate sex (rows) by subgroup (columns)
print(table(data2write[["sex"]], data2write[["subgrp"]]))
##         
##          RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Female           5           30          36  88
##   Male             8           88          99 155
# DSM-5 - find best split that balances participants across sites.
# Only a single candidate seed is evaluated; a wider range is left commented
# out at the end of the call.
a <- findBestSplit(asd_df, seed_range = 172342) #,300001:500000))
## [1] 172342
# Keep every seed whose discrepancy equals the smallest non-missing value.
lowest_discrepancy <- min(a$discrepancy, na.rm = TRUE)
best_seeds <- a$seed[which(a$discrepancy == lowest_discrepancy)]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
# Use the first of the best seeds for all splits.
rngSeed <- best_seeds[1]

# Split the ASD participants of each schedule (A-D, in that order) with
# SplitDatasetsBySex, using the same rngSeed for every schedule.
# Element 2 of the returned list is used as Discovery, element 1 as Replication.
disc_parts <- list()
rep_parts <- list()
for (sched in c("A", "B", "C", "D")) {
  dset2use <- subset(asd_df, asd_df$Schedule == sched)
  tmp_d <- SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
  disc_parts[[sched]] <- tmp_d[[2]]
  rep_parts[[sched]] <- tmp_d[[1]]
}

# Expose the per-schedule halves under their individual names.
A_Discovery <- disc_parts[["A"]]
A_Replication <- rep_parts[["A"]]
B_Discovery <- disc_parts[["B"]]
B_Replication <- rep_parts[["B"]]
C_Discovery <- disc_parts[["C"]]
C_Replication <- rep_parts[["C"]]
D_Discovery <- disc_parts[["D"]]
D_Replication <- rep_parts[["D"]]

# Stack the schedules back together within each half.
df_Disc <- rbind(A_Discovery, B_Discovery, C_Discovery, D_Discovery)
df_Rep <- rbind(A_Replication, B_Replication, C_Replication, D_Replication)

# Schedule x Centre counts in the Discovery half.
a <- table(df_Disc$Schedule, df_Disc$Centre)
print(a)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         6            17       10       7
##   B         9            16       14       3
##   C         6             9       14       2
##   D         0            10       10       0
# Schedule x Centre counts in the Replication half.
b <- table(df_Rep$Schedule, df_Rep$Centre)
print(b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         7            18       10       5
##   B         8            17       13       5
##   C         6            10       13       3
##   D         0             9        9       0
# Cell-wise difference between the halves ...
count_diff <- a - b
print(count_diff)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A        -1            -1        0       2
##   B         1            -1        1      -2
##   C         0            -1        1      -1
##   D         0             1        1       0
# ... and the total absolute difference over all cells.
print(sum(abs(count_diff)))
## [1] 14
# Record which half each split participant belongs to; subjects found in
# neither df_Disc nor df_Rep keep NA at this point.
data2write$dataset <- NA
mask <- data2write$subid %in% df_Disc$subid
data2write$dataset[mask] <- "Discovery"
mask <- data2write$subid %in% df_Rep$subid
data2write$dataset[mask] <- "Replication"

# ASD-only views of each half (rows with a missing dataset or Diagnosis are
# excluded).
in_asd_disc <- data2write$dataset == "Discovery" & data2write$Diagnosis == "ASD"
in_asd_rep <- data2write$dataset == "Replication" & data2write$Diagnosis == "ASD"
asd_Disc <- data2write[which(in_asd_disc), ]
asd_Rep <- data2write[which(in_asd_rep), ]

# # find which seed gives best TD age-match -------------------------------------
# seeds = 1:1000
# pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
# for (i in 1:length(seeds)) {
#   res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
#   td_Disc_matched = res[[2]]
#   td_Rep_matched = res[[1]]
#   tres = t.test(td_Disc_matched$age, asd_Disc$age)
#   pvals[i,1] = tres$p.value
#   tres = t.test(td_Rep_matched$age, asd_Rep$age)
#   pvals[i,2] = tres$p.value
#   #print(i)
# }
# a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
# b=pvals[a$ix,]

# Fixed seed for the TD age matching.
# NOTE(review): presumably picked with the commented-out seed search above —
# confirm.
seed2use <- 929
res <- findTDAgeMatch(data2write, seed_range = rep(seed2use, 2))
# Element 2 of the result is used as Discovery, element 1 as Replication.
td_Disc_matched <- res[[2]]
td_Rep_matched <- res[[1]]

# Label the matched TD participants with their half.
mask <- data2write$subid %in% td_Disc_matched$subid
data2write[mask, "dataset"] <- "Discovery"

mask <- data2write$subid %in% td_Rep_matched$subid
data2write[mask, "dataset"] <- "Replication"

print(table(data2write$dataset, data2write$subgrp))
##              
##               RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Discovery            10           56          67 121
##   Replication           3           62          68 122
# Write the labelled table to the project root; the z threshold is embedded in
# the file name.
z_label <- as.character(z_thresh)
fname2save <- here(sprintf("asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv", z_label))
write.csv(data2write, file = fname2save)

#------------------------------------------------------------------------------
# Descriptive stats

# Measures shared by the analysis frame and the describeBy() call.
measure_cols <- c(
  "viq_all", "piq_all", "fsiq4_all",
  "A_pct_severity", "B_pct_severity",
  "ADI_social_total", "ADI_communication_total", "ADI_RRB_total",
  "ados_2_SA_CSS", "ados_2_RRB_CSS",
  "SRS_tscore", "SRS_tscore_self", "RBS_total", "SSP_total",
  "vabsdscoresc_dss", "vabsdscoresd_dss", "vabsdscoress_dss",
  "vabsabcabc_standard"
)
df2use <- data2write[, c(
  "subid", "Diagnosis", "dataset", "Centre", "meanFD", "age", "sex", "subgrp",
  measure_cols
)]

# Blank out every cell equal to 999 or 777, in any column.
# NOTE(review): presumably missing-data codes — confirm.
mask <- df2use == 999 | df2use == 777
df2use[mask] <- NA
# Rescale age by 365 (NOTE(review): presumably days to years — confirm).
df2use$age <- df2use$age / 365

# dataset and subgrp are part of x because `group` refers to them by name;
# they show up as the starred, all-NaN rows in the printed summary.
cols2use <- c("dataset", "subgrp", "age", "meanFD", measure_cols)
res <- describeBy(x = df2use[, cols2use], group = c("subgrp", "dataset"))
print(res)
## 
##  Descriptive statistics by group 
## subgrp: RRB_over_SC
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad    min    max
## dataset*                   1 10    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 10    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 10  18.01  6.59  21.73   18.53  3.30   7.89  24.03
## meanFD                     4 10   0.26  0.32   0.18    0.18  0.07   0.06   1.14
## viq_all                    5 10  98.49 16.82 101.00   98.50 15.57  73.00 123.85
## piq_all                    6 10  98.30 14.88  99.50  100.62 11.86  64.00 114.00
## fsiq4_all                  7 10  98.50 14.56  99.00   99.12 15.57  74.00 118.01
## A_pct_severity             8 10   0.20  0.10   0.16    0.19  0.10   0.05   0.35
## B_pct_severity             9 10   0.44  0.09   0.43    0.44  0.11   0.30   0.59
## ADI_social_total          10 10  17.50  7.53  20.50   18.00  7.41   5.00  26.00
## ADI_communication_total   11 10  15.60  5.89  16.00   15.75  7.41   6.00  24.00
## ADI_RRB_total             12 10   8.30  1.34   8.00    8.25  1.48   7.00  10.00
## ados_2_SA_CSS             13 10   4.50  2.99   3.50    4.38  3.71   1.00   9.00
## ados_2_RRB_CSS            14 10   3.90  3.78   1.00    3.62  0.00   1.00   9.00
## SRS_tscore                15  6  72.83 10.68  74.00   72.83  6.67  58.00  90.00
## SRS_tscore_self           16  4  59.00  7.12  61.00   59.00  4.45  49.00  65.00
## RBS_total                 17  5  18.20  7.66  19.00   18.20  5.93   8.00  29.00
## SSP_total                 18  4 146.25 21.41 149.50  146.25 20.02 119.00 167.00
## vabsdscoresc_dss          19  6  81.67 17.78  76.00   81.67 11.86  67.00 115.00
## vabsdscoresd_dss          20  6  69.33  8.50  72.50   69.33  5.93  57.00  79.00
## vabsdscoress_dss          21  6  71.50  8.48  72.00   71.50  5.93  57.00  82.00
## vabsabcabc_standard       22  6  72.33  9.69  71.00   72.33  6.67  59.00  88.00
##                         range  skew kurtosis    se
## dataset*                 -Inf    NA       NA    NA
## subgrp*                  -Inf    NA       NA    NA
## age                     16.14 -0.50    -1.72  2.08
## meanFD                   1.09  2.08     3.01  0.10
## viq_all                 50.85 -0.17    -1.52  5.32
## piq_all                 50.00 -0.99     0.13  4.71
## fsiq4_all               44.01 -0.24    -1.35  4.60
## A_pct_severity           0.30  0.18    -1.67  0.03
## B_pct_severity           0.30  0.09    -1.50  0.03
## ADI_social_total        21.00 -0.36    -1.66  2.38
## ADI_communication_total 18.00 -0.08    -1.43  1.86
## ADI_RRB_total            3.00  0.26    -1.85  0.42
## ados_2_SA_CSS            8.00  0.21    -1.77  0.95
## ados_2_RRB_CSS           8.00  0.40    -1.95  1.20
## SRS_tscore              32.00  0.20    -1.24  4.36
## SRS_tscore_self         16.00 -0.50    -1.88  3.56
## RBS_total               21.00  0.08    -1.58  3.43
## SSP_total               48.00 -0.24    -2.09 10.70
## vabsdscoresc_dss        48.00  0.94    -0.82  7.26
## vabsdscoresd_dss        22.00 -0.36    -1.80  3.47
## vabsdscoress_dss        25.00 -0.47    -1.15  3.46
## vabsabcabc_standard     29.00  0.27    -1.25  3.96
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 56    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 56    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 56  16.27  5.73  14.82   15.94  4.79  7.08  30.28
## meanFD                     4 56   0.30  0.52   0.19    0.22  0.13  0.04   3.95
## viq_all                    5 55  97.21 17.99  97.35   97.48 18.76 61.00 136.00
## piq_all                    6 55  99.94 19.47 102.00  100.41 19.19 61.00 142.00
## fsiq4_all                  7 56  98.83 17.79 102.61   99.44 19.37 60.00 131.00
## A_pct_severity             8 56   0.31  0.15   0.32    0.31  0.15  0.00   0.63
## B_pct_severity             9 56   0.32  0.16   0.29    0.31  0.16  0.01   0.69
## ADI_social_total          10 56  17.25  7.05  18.50   17.65  7.41  2.00  27.00
## ADI_communication_total   11 56  14.04  6.14  14.00   14.15  7.41  0.00  26.00
## ADI_RRB_total             12 56   5.55  2.43   5.00    5.54  2.97  0.00  12.00
## ados_2_SA_CSS             13 55   6.20  2.58   6.00    6.27  2.97  1.00  10.00
## ados_2_RRB_CSS            14 55   5.11  2.75   5.00    5.11  2.97  1.00  10.00
## SRS_tscore                15 50  71.96 11.78  74.00   72.20 13.34 47.00  90.00
## SRS_tscore_self           16 30  61.67 11.32  62.50   61.42 12.60 43.00  89.00
## RBS_total                 17 49  18.57 14.44  17.00   16.80 13.34  0.00  60.00
## SSP_total                 18 31 131.10 29.20 136.00  130.64 29.65 81.00 187.00
## vabsdscoresc_dss          19 55  72.45 18.89  75.00   73.24 13.34 21.00 122.00
## vabsdscoresd_dss          20 54  71.72 17.20  70.50   71.64 12.60 25.00 131.00
## vabsdscoress_dss          21 55  70.04 15.32  73.00   71.33 11.86 20.00  95.00
## vabsabcabc_standard       22 54  70.15 13.39  71.50   70.68  9.64 20.00 101.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.20  0.58    -0.54 0.77
## meanFD                    3.91  6.09    39.53 0.07
## viq_all                  75.00 -0.16    -0.65 2.43
## piq_all                  81.00 -0.25    -0.67 2.62
## fsiq4_all                71.00 -0.32    -0.70 2.38
## A_pct_severity            0.63 -0.04    -0.65 0.02
## B_pct_severity            0.68  0.35    -0.48 0.02
## ADI_social_total         25.00 -0.47    -0.91 0.94
## ADI_communication_total  26.00 -0.18    -0.77 0.82
## ADI_RRB_total            12.00  0.09    -0.19 0.33
## ados_2_SA_CSS             9.00 -0.25    -1.14 0.35
## ados_2_RRB_CSS            9.00 -0.33    -0.97 0.37
## SRS_tscore               43.00 -0.21    -1.08 1.67
## SRS_tscore_self          46.00  0.18    -0.78 2.07
## RBS_total                60.00  1.09     0.61 2.06
## SSP_total               106.00  0.11    -1.04 5.25
## vabsdscoresc_dss        101.00 -0.39     0.82 2.55
## vabsdscoresd_dss        106.00  0.29     2.14 2.34
## vabsdscoress_dss         75.00 -0.93     1.05 2.07
## vabsabcabc_standard      81.00 -0.78     2.70 1.82
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 67    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 67    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 67  16.19  5.18  15.91   16.02  5.09  7.56  29.40
## meanFD                     4 67   0.24  0.23   0.15    0.19  0.10  0.03   1.08
## viq_all                    5 66  98.21 19.44 100.00   97.87 19.75 64.55 142.00
## piq_all                    6 66  99.48 22.94 102.49  100.22 21.34 52.43 150.00
## fsiq4_all                  7 67  98.93 19.61 102.00   99.37 19.27 59.00 143.00
## A_pct_severity             8 67   0.42  0.14   0.44    0.42  0.13  0.16   0.82
## B_pct_severity             9 67   0.17  0.10   0.15    0.16  0.11  0.00   0.40
## ADI_social_total          10 67  17.55  6.47  18.00   17.85  7.41  3.00  28.00
## ADI_communication_total   11 67  14.19  5.02  15.00   14.35  5.93  2.00  24.00
## ADI_RRB_total             12 67   2.97  1.98   3.00    2.87  1.48  0.00   8.00
## ados_2_SA_CSS             13 65   6.35  2.58   7.00    6.51  2.97  1.00  10.00
## ados_2_RRB_CSS            14 65   4.68  2.81   5.00    4.58  2.97  1.00  10.00
## SRS_tscore                15 58  72.90 12.17  74.00   73.56 13.34 44.00  95.00
## SRS_tscore_self           16 26  63.65 12.35  62.00   62.77  8.90 42.00  94.00
## RBS_total                 17 56  15.88 16.27  13.00   13.50 14.83  0.00  90.00
## SSP_total                 18 44 140.68 30.53 141.00  142.28 34.84 53.00 189.00
## vabsdscoresc_dss          19 62  71.60 15.81  72.00   72.72 10.38 21.00 104.00
## vabsdscoresd_dss          20 62  71.76 16.40  73.00   71.86 14.83 17.00 112.00
## vabsdscoress_dss          21 62  68.08 16.65  69.00   69.38 14.08 20.00 104.00
## vabsabcabc_standard       22 62  68.29 15.31  70.50   69.32 10.38  6.00 103.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      21.85  0.31    -0.44 0.63
## meanFD                    1.05  2.12     4.27 0.03
## viq_all                  77.45  0.13    -0.64 2.39
## piq_all                  97.57 -0.28    -0.54 2.82
## fsiq4_all                84.00 -0.23    -0.91 2.40
## A_pct_severity            0.66  0.28    -0.23 0.02
## B_pct_severity            0.40  0.33    -0.78 0.01
## ADI_social_total         25.00 -0.35    -0.84 0.79
## ADI_communication_total  22.00 -0.23    -0.60 0.61
## ADI_RRB_total             8.00  0.58    -0.33 0.24
## ados_2_SA_CSS             9.00 -0.51    -0.87 0.32
## ados_2_RRB_CSS            9.00 -0.17    -1.21 0.35
## SRS_tscore               51.00 -0.36    -0.39 1.60
## SRS_tscore_self          52.00  0.89     0.34 2.42
## RBS_total                90.00  2.03     6.03 2.17
## SSP_total               136.00 -0.51    -0.12 4.60
## vabsdscoresc_dss         83.00 -1.03     2.46 2.01
## vabsdscoresd_dss         95.00 -0.33     1.37 2.08
## vabsdscoress_dss         84.00 -0.77     0.84 2.11
## vabsabcabc_standard      97.00 -1.41     4.41 1.94
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Discovery
##                         vars   n   mean    sd median trimmed   mad    min
## dataset*                   1 121    NaN    NA     NA     NaN    NA    Inf
## subgrp*                    2 121    NaN    NA     NA     NaN    NA    Inf
## age                        3 121  16.83  5.23  16.65   16.73  5.69   7.22
## meanFD                     4 121   0.18  0.15   0.13    0.15  0.07   0.03
## viq_all                    5 119 104.52 19.70 105.00  105.03 19.27  46.00
## piq_all                    6 119 106.10 19.47 107.00  107.53 17.79  49.00
## fsiq4_all                  7 119 105.72 18.33 108.18  106.99 16.58  53.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA    Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA    Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA    Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA    Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA    Inf
## SRS_tscore                15  68  47.84  9.40  45.00   46.32  5.19  37.00
## SRS_tscore_self           16  71  46.69  4.85  46.00   46.26  4.45  39.00
## RBS_total                 17  68   2.15  4.74   0.00    0.95  0.00   0.00
## SSP_total                 18  59 177.86 12.71 182.00  179.78  8.90 122.00
## vabsdscoresc_dss          19  34  91.97 25.44  99.50   93.50 21.50  21.00
## vabsdscoresd_dss          20  34  90.74 20.28  98.50   92.25 17.05  33.00
## vabsdscoress_dss          21  34  96.21 23.67 102.50   98.57 21.50  33.00
## vabsabcabc_standard       22  34  92.06 23.04  99.50   93.89 15.57  25.00
##                            max  range  skew kurtosis   se
## dataset*                  -Inf   -Inf    NA       NA   NA
## subgrp*                   -Inf   -Inf    NA       NA   NA
## age                      29.84  22.62  0.17    -0.51 0.48
## meanFD                    0.85   0.82  2.28     5.65 0.01
## viq_all                 160.00 114.00 -0.24     0.38 1.81
## piq_all                 147.00  98.00 -0.69     0.32 1.79
## fsiq4_all               142.00  89.00 -0.69     0.58 1.68
## A_pct_severity            -Inf   -Inf    NA       NA   NA
## B_pct_severity            -Inf   -Inf    NA       NA   NA
## ADI_social_total          -Inf   -Inf    NA       NA   NA
## ADI_communication_total   -Inf   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf   -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf   -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf   -Inf    NA       NA   NA
## SRS_tscore               76.00  39.00  1.54     1.44 1.14
## SRS_tscore_self          63.00  24.00  0.93     1.10 0.58
## RBS_total                27.00  27.00  3.13    10.87 0.57
## SSP_total               190.00  68.00 -1.90     4.85 1.66
## vabsdscoresc_dss        138.00 117.00 -0.71     0.36 4.36
## vabsdscoresd_dss        121.00  88.00 -0.80     0.07 3.48
## vabsdscoress_dss        129.00  96.00 -0.83    -0.31 4.06
## vabsabcabc_standard     127.00 102.00 -0.90     0.30 3.95
## ------------------------------------------------------------ 
## subgrp: RRB_over_SC
## dataset: Replication
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 3    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 3    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 3  14.54  4.39  12.61   14.54  1.72  11.45  19.56
## meanFD                     4 3   0.23  0.14   0.20    0.23  0.15   0.10   0.38
## viq_all                    5 3 111.00 28.00  99.00  111.00 11.86  91.00 143.00
## piq_all                    6 3 119.33 29.54 121.00  119.33 40.03  89.00 148.00
## fsiq4_all                  7 3 115.67 28.75 106.00  115.67 19.27  93.00 148.00
## A_pct_severity             8 3   0.20  0.07   0.16    0.20  0.01   0.15   0.27
## B_pct_severity             9 3   0.40  0.05   0.40    0.40  0.08   0.35   0.45
## ADI_social_total          10 3  15.67  2.52  16.00   15.67  2.97  13.00  18.00
## ADI_communication_total   11 3   8.67  2.52   9.00    8.67  2.97   6.00  11.00
## ADI_RRB_total             12 3   5.67  1.53   6.00    5.67  1.48   4.00   7.00
## ados_2_SA_CSS             13 3   4.67  2.52   5.00    4.67  2.97   2.00   7.00
## ados_2_RRB_CSS            14 3   4.33  3.06   5.00    4.33  2.97   1.00   7.00
## SRS_tscore                15 3  68.00  6.93  72.00   68.00  0.00  60.00  72.00
## SRS_tscore_self           16 1  67.00    NA  67.00   67.00  0.00  67.00  67.00
## RBS_total                 17 3  12.67  5.51  13.00   12.67  7.41   7.00  18.00
## SSP_total                 18 2 146.00  9.90 146.00  146.00 10.38 139.00 153.00
## vabsdscoresc_dss          19 3  82.33 14.43  74.00   82.33  0.00  74.00  99.00
## vabsdscoresd_dss          20 3  69.33  4.16  68.00   69.33  2.97  66.00  74.00
## vabsdscoress_dss          21 3  84.33  9.71  82.00   84.33  8.90  76.00  95.00
## vabsabcabc_standard       22 3  65.67 23.18  77.00   65.67  5.93  39.00  81.00
##                         range  skew kurtosis    se
## dataset*                 -Inf    NA       NA    NA
## subgrp*                  -Inf    NA       NA    NA
## age                      8.11  0.35    -2.33  2.53
## meanFD                   0.28  0.18    -2.33  0.08
## viq_all                 52.00  0.35    -2.33 16.17
## piq_all                 59.00 -0.06    -2.33 17.05
## fsiq4_all               55.00  0.30    -2.33 16.60
## A_pct_severity           0.12  0.38    -2.33  0.04
## B_pct_severity           0.11 -0.02    -2.33  0.03
## ADI_social_total         5.00 -0.13    -2.33  1.45
## ADI_communication_total  5.00 -0.13    -2.33  1.45
## ADI_RRB_total            3.00 -0.21    -2.33  0.88
## ados_2_SA_CSS            5.00 -0.13    -2.33  1.45
## ados_2_RRB_CSS           6.00 -0.21    -2.33  1.76
## SRS_tscore              12.00 -0.38    -2.33  4.00
## SRS_tscore_self          0.00    NA       NA    NA
## RBS_total               11.00 -0.06    -2.33  3.18
## SSP_total               14.00  0.00    -2.75  7.00
## vabsdscoresc_dss        25.00  0.38    -2.33  8.33
## vabsdscoresd_dss         8.00  0.29    -2.33  2.40
## vabsdscoress_dss        19.00  0.23    -2.33  5.61
## vabsabcabc_standard     42.00 -0.37    -2.33 13.38
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 62    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 62    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 62  16.82  5.94  16.42   16.51  6.13  7.12  30.15
## meanFD                     4 62   0.23  0.23   0.18    0.19  0.11  0.05   1.60
## viq_all                    5 61 101.74 16.02 102.73  102.15 17.39 70.00 133.00
## piq_all                    6 61 103.30 18.99 105.98  104.48 19.31 52.00 134.00
## fsiq4_all                  7 61 102.84 17.03 105.00  103.57 17.79 64.00 131.00
## A_pct_severity             8 62   0.25  0.12   0.24    0.25  0.13  0.04   0.65
## B_pct_severity             9 62   0.24  0.13   0.23    0.23  0.11  0.00   0.67
## ADI_social_total          10 62  14.60  6.35  15.50   14.80  6.67  1.00  27.00
## ADI_communication_total   11 62  11.61  5.70  11.00   11.54  5.93  0.00  24.00
## ADI_RRB_total             12 62   4.00  2.32   4.00    3.96  2.22  0.00   9.00
## ados_2_SA_CSS             13 60   5.57  2.51   6.00    5.62  2.97  1.00  10.00
## ados_2_RRB_CSS            14 60   5.07  2.46   6.00    5.08  1.48  1.00  10.00
## SRS_tscore                15 56  65.89 11.23  67.00   65.72 12.60 43.00  90.00
## SRS_tscore_self           16 30  60.97  7.28  61.50   61.12  6.67 46.00  79.00
## RBS_total                 17 53  13.47 11.19  11.00   12.23 10.38  0.00  52.00
## SSP_total                 18 35 139.77 27.00 142.00  141.76 34.10 69.00 177.00
## vabsdscoresc_dss          19 56  82.18 14.63  80.00   81.50 14.83 50.00 122.00
## vabsdscoresd_dss          20 55  78.85 15.95  77.00   78.20 11.86 38.00 119.00
## vabsdscoress_dss          21 56  76.27 15.44  77.50   77.17 13.34 28.00 101.00
## vabsabcabc_standard       22 55  78.31 12.58  77.00   77.87  8.90 48.00 117.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.03  0.39    -0.66 0.75
## meanFD                    1.55  3.88    19.57 0.03
## viq_all                  63.00 -0.17    -0.92 2.05
## piq_all                  82.00 -0.54    -0.08 2.43
## fsiq4_all                67.00 -0.35    -0.63 2.18
## A_pct_severity            0.61  0.44     0.30 0.02
## B_pct_severity            0.67  0.69     0.92 0.02
## ADI_social_total         26.00 -0.24    -0.83 0.81
## ADI_communication_total  24.00  0.10    -0.71 0.72
## ADI_RRB_total             9.00  0.18    -0.60 0.29
## ados_2_SA_CSS             9.00 -0.21    -0.89 0.32
## ados_2_RRB_CSS            9.00 -0.45    -0.53 0.32
## SRS_tscore               47.00  0.09    -0.83 1.50
## SRS_tscore_self          33.00  0.02    -0.02 1.33
## RBS_total                52.00  1.26     1.86 1.54
## SSP_total               108.00 -0.58    -0.49 4.56
## vabsdscoresc_dss         72.00  0.50     0.04 1.96
## vabsdscoresd_dss         81.00  0.29     0.37 2.15
## vabsdscoress_dss         73.00 -0.77     1.19 2.06
## vabsabcabc_standard      69.00  0.46     0.53 1.70
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 68    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 68    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 68  16.10  5.12  15.63   15.96  6.08  7.48  29.23
## meanFD                     4 68   0.26  0.28   0.17    0.20  0.11  0.04   1.59
## viq_all                    5 65  96.31 19.73  99.00   97.55 20.76 50.91 130.00
## piq_all                    6 67  98.83 20.92 104.00  100.28 20.76 44.03 138.00
## fsiq4_all                  7 66  98.22 19.41 103.00   99.20 19.46 59.00 139.00
## A_pct_severity             8 68   0.46  0.15   0.46    0.46  0.19  0.19   0.75
## B_pct_severity             9 68   0.20  0.12   0.18    0.19  0.13  0.00   0.47
## ADI_social_total          10 68  18.47  5.69  19.00   18.66  5.93  6.00  29.00
## ADI_communication_total   11 68  14.82  4.71  15.50   14.96  5.19  4.00  24.00
## ADI_RRB_total             12 68   3.81  2.46   3.50    3.66  2.22  0.00  10.00
## ados_2_SA_CSS             13 65   6.18  2.83   6.00    6.28  4.45  1.00  10.00
## ados_2_RRB_CSS            14 65   4.45  2.80   5.00    4.32  2.97  1.00   9.00
## SRS_tscore                15 59  74.73 11.62  78.00   75.53 13.34 48.00  90.00
## SRS_tscore_self           16 32  63.16 10.33  61.50   62.96  8.90 40.00  84.00
## RBS_total                 17 59  19.02 15.25  15.00   17.43 13.34  0.00  73.00
## SSP_total                 18 46 139.11 25.30 139.50  139.42 28.17 91.00 184.00
## vabsdscoresc_dss          19 62  73.24 14.96  74.50   73.82 11.12 21.00 110.00
## vabsdscoresd_dss          20 62  70.98 15.74  68.00   70.62 15.57 42.00 118.00
## vabsdscoress_dss          21 62  66.32 16.12  67.50   66.42 14.83 23.00 112.00
## vabsabcabc_standard       22 62  68.23 13.79  68.50   68.58 11.12 28.00 107.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     21.75  0.29    -0.78 0.62
## meanFD                   1.54  3.07    10.17 0.03
## viq_all                 79.09 -0.54    -0.55 2.45
## piq_all                 93.97 -0.58    -0.48 2.56
## fsiq4_all               80.00 -0.44    -0.73 2.39
## A_pct_severity           0.56 -0.01    -1.04 0.02
## B_pct_severity           0.47  0.37    -0.77 0.01
## ADI_social_total        23.00 -0.32    -0.74 0.69
## ADI_communication_total 20.00 -0.28    -0.83 0.57
## ADI_RRB_total           10.00  0.53    -0.53 0.30
## ados_2_SA_CSS            9.00 -0.17    -1.19 0.35
## ados_2_RRB_CSS           8.00 -0.09    -1.37 0.35
## SRS_tscore              42.00 -0.50    -0.78 1.51
## SRS_tscore_self         44.00  0.17    -0.27 1.83
## RBS_total               73.00  1.14     1.08 1.99
## SSP_total               93.00 -0.16    -0.94 3.73
## vabsdscoresc_dss        89.00 -0.70     2.19 1.90
## vabsdscoresd_dss        76.00  0.39    -0.11 2.00
## vabsdscoress_dss        89.00 -0.04     0.39 2.05
## vabsabcabc_standard     79.00 -0.27     1.23 1.75
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Replication
##                         vars   n   mean    sd median trimmed   mad   min    max
## dataset*                   1 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 122  16.86  6.07  16.34   16.58  7.59  6.89  29.72
## meanFD                     4 122   0.23  0.46   0.14    0.15  0.07  0.04   4.60
## viq_all                    5 122 104.02 17.58 108.18  105.64 12.13 45.00 140.00
## piq_all                    6 122 104.64 18.41 108.96  106.56 14.08 49.00 139.00
## fsiq4_all                  7 122 104.94 17.14 108.09  107.29 11.86 50.00 134.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## SRS_tscore                15  65  47.23  9.34  44.00   45.66  4.45 37.00  90.00
## SRS_tscore_self           16  61  48.44  6.84  47.00   47.63  5.93 39.00  69.00
## RBS_total                 17  63   3.08 11.54   0.00    0.86  0.00  0.00  89.00
## SSP_total                 18  54 174.93 19.38 182.00  178.41  6.67 75.00 190.00
## vabsdscoresc_dss          19  39  92.74 25.45  96.00   95.82 20.76 21.00 125.00
## vabsdscoresd_dss          20  39  91.10 22.65  97.00   93.91 14.83 27.00 122.00
## vabsdscoress_dss          21  39  98.90 27.04 103.00  102.12 17.79 20.00 132.00
## vabsabcabc_standard       22  38  93.00 25.39 100.00   96.44 15.57 20.00 126.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      22.83  0.33    -0.97 0.55
## meanFD                    4.56  7.54    64.83 0.04
## viq_all                  95.00 -0.99     1.51 1.59
## piq_all                  90.00 -0.93     0.67 1.67
## fsiq4_all                84.00 -1.28     1.67 1.55
## A_pct_severity            -Inf    NA       NA   NA
## B_pct_severity            -Inf    NA       NA   NA
## ADI_social_total          -Inf    NA       NA   NA
## ADI_communication_total   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf    NA       NA   NA
## SRS_tscore               53.00  2.14     5.82 1.16
## SRS_tscore_self          30.00  1.05     0.48 0.88
## RBS_total                89.00  6.60    45.89 1.45
## SSP_total               115.00 -2.88    10.92 2.64
## vabsdscoresc_dss        104.00 -1.21     1.00 4.08
## vabsdscoresd_dss         95.00 -1.34     1.26 3.63
## vabsdscoress_dss        112.00 -1.26     1.10 4.33
## vabsabcabc_standard     106.00 -1.42     1.40 4.12
#------------------------------------------------------------------------------
# Table of subtypes X sex

# Discovery: subgroup x sex counts and Pearson chi-squared test
data2use <- data2write[which(data2write$dataset == "Discovery"), ]
print(table(data2use$subgrp, data2use$sex))
##               
##                Female Male
##   RRB_over_SC       4    6
##   SC_equal_RRB     15   41
##   SC_over_RRB      16   51
##   TD               41   80
cs_res <- chisq.test(data2use$subgrp, data2use$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 2.8193, df = 3, p-value = 0.4203
# Replication: same table and test on the other half
data2use <- data2write[which(data2write$dataset == "Replication"), ]
print(table(data2use$subgrp, data2use$sex))
##               
##                Female Male
##   RRB_over_SC       1    2
##   SC_equal_RRB     15   47
##   SC_over_RRB      20   48
##   TD               47   75
cs_res <- chisq.test(data2use$subgrp, data2use$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.2615, df = 3, p-value = 0.2346
#------------------------------------------------------------------------------
# Dependent variables to model, one per row of the results table.
vars2analyze <- c(
  "age", "meanFD",
  "viq_all", "piq_all", "fsiq4_all",
  "A_pct_severity", "B_pct_severity",
  "ADI_social_total", "ADI_communication_total", "ADI_RRB_total",
  "ados_2_SA_CSS", "ados_2_RRB_CSS",
  "SRS_tscore_self", "RBS_total", "SSP_total",
  "vabsdscoress_dss", "vabsdscoresd_dss", "vabsdscoresc_dss",
  "vabsabcabc_standard"
)

# Display labels, positionally aligned with vars2analyze.
# NOTE(review): "ADI-R RRB" occurs twice (entries 7 and 10) — confirm that
# duplicate labels are intended wherever vnames is consumed.
vnames <- c(
  "Age", "Mean FD",
  "VIQ", "PIQ", "FIQ",
  "ADI-R SC", "ADI-R RRB",
  "ADI-R Social", "ADI-R Communication", "ADI-R RRB",
  "ADOS SA CSS", "ADOS RRB CSS",
  "SRS", "RBS", "SSP",
  "Vineland Socialization", "Vineland Daily Living Skills",
  "Vineland Communication",
  "Vineland ABC"
)

# Result columns: omnibus statistics for each half, then the pairwise
# SC_equal_RRB vs SC_over_RRB statistics for each half, then one repBF column.
cnames <- c(
  "All_Disc.fstat",
  "All_Disc.pval",
  "All_Rep.fstat",
  "All_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.fstat",
  "SCequalRRB_vs_SCoverRRB_Disc.tstat",
  "SCequalRRB_vs_SCoverRRB_Disc.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.es",
  "SCequalRRB_vs_SCoverRRB_Rep.fstat",
  "SCequalRRB_vs_SCoverRRB_Rep.tstat",
  "SCequalRRB_vs_SCoverRRB_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Rep.es",
  "SCequalRRB_vs_SCoverRRB.repBF"
)

# Empty (all-NA) results table, addressed by variable name and column name;
# the variable names are also stored as a regular column.
empty_cells <- matrix(
  NA,
  nrow = length(vars2analyze),
  ncol = length(cnames),
  dimnames = list(vars2analyze, cnames)
)
output_res <- data.frame(empty_cells)
output_res$varNames <- vars2analyze

# For every phenotypic variable in vars2analyze, fill one row of output_res and
# print one plot:
#   1. Omnibus subgroup effect (all rows of each dataset), Discovery then
#      Replication: F and p from anova().
#   2. SC_equal_RRB vs SC_over_RRB contrast (TD and RRB_over_SC rows removed),
#      Discovery then Replication: F, p, t and Cohen's d.
#   3. Replication Bayes factor computed from the Discovery t-statistic (tobs)
#      and the Replication t-statistic (trep).
#   4. Jitter + boxplot of the variable by subgroup, faceted by dataset.
# Every model is an lme fit with subgrp as the only fixed effect and a random
# intercept for Centre; rows with missing values are dropped via na.omit.
for (ivar in seq_along(vars2analyze)){

  y_var <- vars2analyze[ivar]
  # print(y_var)

  # The formulas are identical for all four fits of this variable, so build
  # them once: fixed effect of subgrp, random intercept per Centre.
  fx_form <- as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
  rx_form <- as.formula(sprintf("~ 1|%s","Centre"))

  #----------------------------------------------------------------------------
  # Discovery: omnibus test across all subgroups
  df4mod <- subset(df2use, df2use$dataset=="Discovery")
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"All_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Disc.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Replication: omnibus test across all subgroups
  df4mod <- subset(df2use, df2use$dataset=="Replication")
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"All_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Rep.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Discovery: SC_equal_RRB vs SC_over_RRB only
  df4mod <- subset(df2use, df2use$dataset=="Discovery" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  # Group sizes handed to the replication Bayes factor below.
  # NOTE(review): these count every subject in the subgroup, including any
  # whose y_var is missing and is therefore dropped from the model by na.omit -
  # confirm these are the sample sizes BFSALL should receive.
  n1 <- sum(df4mod$subgrp=="SC_equal_RRB")
  m1 <- sum(df4mod$subgrp=="SC_over_RRB")

  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.pval"] <- res["subgrp","p-value"]
  # second row of the fixed-effects table is the subgrp coefficient
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                  df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication: SC_equal_RRB vs SC_over_RRB only
  df4mod <- subset(df2use, df2use$dataset=="Replication" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  n2 <- sum(df4mod$subgrp=="SC_equal_RRB")
  m2 <- sum(df4mod$subgrp=="SC_over_RRB")

  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.pval"] <- res["subgrp","p-value"]
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication Bayes Factor
  # tobs is the Discovery t-statistic and trep the Replication t-statistic;
  # passing the Discovery value for both would make the BF ignore the
  # Replication result entirely.
  res_bf <- BFSALL(tobs = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"],
                   trep = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"],
                   n1 = n1,
                   n2 = n2,
                   m1 = m1,
                   m2 = m2,
                   sample = 2,
                   Type = 'ALL')
  output_res[y_var,"SCequalRRB_vs_SCoverRRB.repBF"] <- res_bf["Replication BF","Replication 1"]

  # make a plot: all subgroups except RRB_over_SC, one facet per dataset
  colors2use <- get_ggColorHue(3)
  df4plot <- subset(df2use, df2use$subgrp!="RRB_over_SC")
  p <- ggplot(data = df4plot, aes_string(x = "subgrp", y = y_var, colour = "subgrp")) + facet_grid(. ~ dataset)
  p <- p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
  p <- p + ylab(vnames[ivar]) + xlab("Group") +
    scale_x_discrete(labels=c("SC_equal_RRB"="SC=RRB","SC_over_RRB"="SC>RRB","TD"="TD")) +
    scale_colour_manual(values = c(colors2use[2:3],"grey60")) + guides(colour=FALSE) +
    theme(text = element_text(size=fontSize-5),
          axis.text.x = element_text(size=fontSize-5),
          axis.text.y = element_text(size=fontSize-5))
  print(p)

}

# Record the SC_equal_RRB vs SC_over_RRB effect sizes under the "0.7" row of
# the vabc (Vineland ABC) and vabc_dls (vabsdscoresd_dss) tables, then print
# the full results table.
vabc["0.7", "Discovery"] <- output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc["0.7", "Replication"] <- output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Rep.es"]
vabc_dls["0.7", "Discovery"] <- output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc_dls["0.7", "Replication"] <- output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Rep.es"]
output_res
##                         All_Disc.fstat All_Disc.pval All_Rep.fstat All_Rep.pval
## age                          0.4443736  7.214972e-01    0.57911736 6.292321e-01
## meanFD                       2.3454190  7.345813e-02    0.09283242 9.639462e-01
## viq_all                      2.0746336  1.041775e-01    1.96069026 1.204982e-01
## piq_all                      1.5490396  2.024688e-01    0.92349384 4.300367e-01
## fsiq4_all                    2.2140636  8.706621e-02    1.32551779 2.666330e-01
## A_pct_severity              28.5757233  5.661294e-11   55.06317758 0.000000e+00
## B_pct_severity              30.4467534  1.578115e-11    3.65737898 2.855480e-02
## ADI_social_total             0.9981866  3.714198e-01    9.46612207 1.472079e-04
## ADI_communication_total      1.1377968  3.237695e-01   11.08966914 3.639446e-05
## ADI_RRB_total               34.6469967  9.816592e-13    0.44356732 6.427334e-01
## ados_2_SA_CSS                2.1610826  1.195199e-01    1.34800309 2.635994e-01
## ados_2_RRB_CSS               1.1598655  3.169061e-01    0.85013622 4.298725e-01
## SRS_tscore_self             37.7795551  0.000000e+00   32.89311674 1.776357e-15
## RBS_total                   18.8124369  1.387963e-10   15.53716896 5.618780e-09
## SSP_total                   31.9881267  1.554312e-15   22.88230998 6.086132e-12
## vabsdscoress_dss            22.9718095  2.712497e-12   24.80873326 3.891332e-13
## vabsdscoresd_dss            11.6170742  6.934610e-07   10.55576140 2.398491e-06
## vabsdscoresc_dss             9.4813361  8.952522e-06    8.85284807 1.900485e-05
## vabsabcabc_standard         17.8307895  6.086744e-10   17.49695128 8.412286e-10
##                         SCequalRRB_vs_SCoverRRB_Disc.fstat
## age                                           7.735489e-03
## meanFD                                        8.606177e-01
## viq_all                                       1.056295e-02
## piq_all                                       1.459087e-02
## fsiq4_all                                     9.379364e-04
## A_pct_severity                                3.250674e+01
## B_pct_severity                                3.873294e+01
## ADI_social_total                              1.235755e+00
## ADI_communication_total                       1.982731e+00
## ADI_RRB_total                                 3.720159e+01
## ados_2_SA_CSS                                 1.443778e-01
## ados_2_RRB_CSS                                4.365431e-01
## SRS_tscore_self                               2.298479e+00
## RBS_total                                     2.171167e-01
## SSP_total                                     1.451255e+00
## vabsdscoress_dss                              1.824253e+00
## vabsdscoresd_dss                              2.978555e-01
## vabsdscoresc_dss                              9.320227e-02
## vabsabcabc_standard                           1.294766e+00
##                         SCequalRRB_vs_SCoverRRB_Disc.tstat
## age                                            -0.08795163
## meanFD                                         -0.92769484
## viq_all                                         0.10277623
## piq_all                                        -0.12079267
## fsiq4_all                                       0.03062575
## A_pct_severity                                  5.70146849
## B_pct_severity                                 -6.22357950
## ADI_social_total                                1.11164497
## ADI_communication_total                         1.40809488
## ADI_RRB_total                                  -6.09931090
## ados_2_SA_CSS                                   0.37997078
## ados_2_RRB_CSS                                 -0.66071411
## SRS_tscore_self                                 1.51607362
## RBS_total                                      -0.46595786
## SSP_total                                       1.20468026
## vabsdscoress_dss                               -1.35064915
## vabsdscoresd_dss                               -0.54576142
## vabsdscoresc_dss                               -0.30529046
## vabsabcabc_standard                            -1.13787792
##                         SCequalRRB_vs_SCoverRRB_Disc.pval
## age                                          9.300642e-01
## meanFD                                       3.554596e-01
## viq_all                                      9.183180e-01
## piq_all                                      9.040642e-01
## fsiq4_all                                    9.756197e-01
## A_pct_severity                               8.929487e-08
## B_pct_severity                               7.669901e-09
## ADI_social_total                             2.685506e-01
## ADI_communication_total                      1.617322e-01
## ADI_RRB_total                                1.388656e-08
## ados_2_SA_CSS                                7.046679e-01
## ados_2_RRB_CSS                               5.101171e-01
## SRS_tscore_self                              1.356750e-01
## RBS_total                                    6.422590e-01
## SSP_total                                    2.323840e-01
## vabsdscoress_dss                             1.795305e-01
## vabsdscoresd_dss                             5.863251e-01
## vabsdscoresc_dss                             7.607116e-01
## vabsabcabc_standard                          2.576216e-01
##                         SCequalRRB_vs_SCoverRRB_Disc.es
## age                                         0.015924500
## meanFD                                      0.167967896
## viq_all                                    -0.053002168
## piq_all                                     0.021308962
## fsiq4_all                                  -0.005545139
## A_pct_severity                             -0.788506161
## B_pct_severity                              1.179425752
## ADI_social_total                           -0.044849376
## ADI_communication_total                    -0.028505053
## ADI_RRB_total                               1.174142890
## ados_2_SA_CSS                              -0.059546879
## ados_2_RRB_CSS                              0.155478951
## SRS_tscore_self                            -0.167048495
## RBS_total                                   0.174333540
## SSP_total                                  -0.320171280
## vabsdscoress_dss                            0.121774707
## vabsdscoresd_dss                           -0.002137628
## vabsdscoresc_dss                            0.049639893
## vabsabcabc_standard                         0.128410012
##                         SCequalRRB_vs_SCoverRRB_Rep.fstat
## age                                          5.468116e-01
## meanFD                                       2.977673e-01
## viq_all                                      2.006791e+00
## piq_all                                      7.424804e-01
## fsiq4_all                                    1.054057e+00
## A_pct_severity                               1.037983e+02
## B_pct_severity                               2.356628e+00
## ADI_social_total                             1.852925e+01
## ADI_communication_total                      1.857849e+01
## ADI_RRB_total                                1.400011e-03
## ados_2_SA_CSS                                1.777220e+00
## ados_2_RRB_CSS                               1.668320e+00
## SRS_tscore_self                              9.192707e-01
## RBS_total                                    8.944074e+00
## SSP_total                                    1.822205e-01
## vabsdscoress_dss                             1.613878e+01
## vabsdscoresd_dss                             1.028992e+01
## vabsdscoresc_dss                             1.072178e+01
## vabsabcabc_standard                          2.238903e+01
##                         SCequalRRB_vs_SCoverRRB_Rep.tstat
## age                                           -0.73946708
## meanFD                                         0.54568060
## viq_all                                       -1.41661237
## piq_all                                       -0.86167302
## fsiq4_all                                     -1.02667265
## A_pct_severity                                10.18814623
## B_pct_severity                                -1.53513138
## ADI_social_total                               4.30456208
## ADI_communication_total                        4.31027765
## ADI_RRB_total                                  0.03741673
## ados_2_SA_CSS                                  1.33312422
## ados_2_RRB_CSS                                -1.29163473
## SRS_tscore_self                                0.95878608
## RBS_total                                      2.99066443
## SSP_total                                     -0.42687292
## vabsdscoress_dss                              -4.01731065
## vabsdscoresd_dss                              -3.20779037
## vabsdscoresc_dss                              -3.27441347
## vabsabcabc_standard                           -4.73170469
##                         SCequalRRB_vs_SCoverRRB_Rep.pval
## age                                         4.610099e-01
## meanFD                                      5.862580e-01
## viq_all                                     1.591657e-01
## piq_all                                     3.905442e-01
## fsiq4_all                                   3.066059e-01
## A_pct_severity                              0.000000e+00
## B_pct_severity                              1.272789e-01
## ADI_social_total                            3.346762e-05
## ADI_communication_total                     3.272369e-05
## ADI_RRB_total                               9.702124e-01
## ados_2_SA_CSS                               1.850164e-01
## ados_2_RRB_CSS                              1.989659e-01
## SRS_tscore_self                             3.417159e-01
## RBS_total                                   3.455054e-03
## SSP_total                                   6.706787e-01
## vabsdscoress_dss                            1.063990e-04
## vabsdscoresd_dss                            1.744364e-03
## vabsdscoresc_dss                            1.405708e-03
## vabsabcabc_standard                         6.546637e-06
##                         SCequalRRB_vs_SCoverRRB_Rep.es
## age                                         0.12984949
## meanFD                                     -0.09582082
## viq_all                                     0.30061512
## piq_all                                     0.22307565
## fsiq4_all                                   0.25203121
## A_pct_severity                             -1.55030116
## B_pct_severity                              0.36362290
## ADI_social_total                           -0.64438806
## ADI_communication_total                    -0.61676272
## ADI_RRB_total                               0.07980667
## ados_2_SA_CSS                              -0.23061201
## ados_2_RRB_CSS                              0.23515014
## SRS_tscore_self                            -0.24311317
## RBS_total                                  -0.41169112
## SSP_total                                   0.02536933
## vabsdscoress_dss                            0.62933627
## vabsdscoresd_dss                            0.49686127
## vabsdscoresc_dss                            0.60368112
## vabsabcabc_standard                         0.76211234
##                         SCequalRRB_vs_SCoverRRB.repBF                varNames
## age                                      6.992496e-01                     age
## meanFD                                   1.069978e+00                  meanFD
## viq_all                                  6.991060e-01                 viq_all
## piq_all                                  7.014257e-01                 piq_all
## fsiq4_all                                6.964927e-01               fsiq4_all
## A_pct_severity                           1.424045e+06          A_pct_severity
## B_pct_severity                           1.631060e+07          B_pct_severity
## ADI_social_total                         1.290726e+00        ADI_social_total
## ADI_communication_total                  1.869007e+00 ADI_communication_total
## ADI_RRB_total                            9.018462e+06           ADI_RRB_total
## ados_2_SA_CSS                            7.482395e-01           ados_2_SA_CSS
## ados_2_RRB_CSS                           8.654343e-01          ados_2_RRB_CSS
## SRS_tscore_self                          2.186721e+00         SRS_tscore_self
## RBS_total                                7.758777e-01               RBS_total
## SSP_total                                1.435949e+00               SSP_total
## vabsdscoress_dss                         1.724381e+00        vabsdscoress_dss
## vabsdscoresd_dss                         8.084267e-01        vabsdscoresd_dss
## vabsdscoresc_dss                         7.292319e-01        vabsdscoresc_dss
## vabsabcabc_standard                      1.326929e+00     vabsabcabc_standard

SC-RRB difference z = 0.8

#------------------------------------------------------------------------------
# Subtype both NDAR datasets with a z cut-off of 0.8 on the difference score
# (first variable in vars2use minus the second)
z_thresh <- 0.8

# vars2use = c("dbaes_atotal","dbaes_btotal")

# Discovery: standardise against the Discovery set's own mean and sd
ds_disc <- Dverbal_Discovery[[vars2use[1]]] - Dverbal_Discovery[[vars2use[2]]]
mean2use <- mean(ds_disc)
sd2use <- sd(ds_disc)

Dverbal_Discovery <- make_subtype(
  data2use = Dverbal_Discovery,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

# Replication: standardise against the Replication set's own mean and sd
ds_rep <- Dverbal_Replication[[vars2use[1]]] - Dverbal_Replication[[vars2use[2]]]
mean2use <- mean(ds_rep)
sd2use <- sd(ds_rep)

Dverbal_Replication <- make_subtype(
  data2use = Dverbal_Replication,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#------------------------------------------------------------------------------
# Cross-tabulation of subtype by sex

# Discovery: subjects per z_ds_group x sex
table(Dverbal_Discovery$z_ds_group, Dverbal_Discovery$sex)
##               
##                  F   M
##   RRB_over_SC   35 143
##   SC_equal_RRB 125 413
##   SC_over_RRB   37 136
# Pearson chi-squared test of independence between subtype and sex (Discovery)
cs_res <- chisq.test(x = Dverbal_Discovery$z_ds_group, y = Dverbal_Discovery$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Discovery$z_ds_group and Dverbal_Discovery$sex
## X-squared = 1.0632, df = 2, p-value = 0.5877
# Replication: subjects per z_ds_group x sex
table(Dverbal_Replication$z_ds_group, Dverbal_Replication$sex)
##               
##                  F   M
##   RRB_over_SC   31 146
##   SC_equal_RRB 130 422
##   SC_over_RRB   35 126
# Pearson chi-squared test of independence between subtype and sex (Replication)
cs_res <- chisq.test(x = Dverbal_Replication$z_ds_group, y = Dverbal_Replication$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Replication$z_ds_group and Dverbal_Replication$sex
## X-squared = 2.8532, df = 2, p-value = 0.2401
#------------------------------------------------------------------------------
# Per-subtype descriptive statistics

# Discovery: keep identifiers plus the variables to summarise
df2use <- Dverbal_Discovery[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# age = interview_age / 12 (presumably months converted to years)
df2use$age <- df2use$interview_age / 12

# summarise the selected columns separately for each level of z_ds_group
cols2use <- c(
  "z_ds_group", "sex", "age", "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)
res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 178   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 178    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 178  10.05  4.11   9.50    9.76  3.46  2.00  27.17  25.17
## ados_age        4  25  91.60 38.51  86.00   89.52 38.55 37.00 171.00 134.00
## ados_sa_css     5  25   6.44  2.45   7.00    6.52  2.97  2.00  10.00   8.00
## ados_rrb_css    6  25   7.52  2.14   8.00    7.76  1.48  1.00  10.00   9.00
## iq              7  47 101.53 15.74 101.00  102.41 16.31 54.00 139.00  85.00
## dbaes_atotal    8 178   0.22  0.11   0.21    0.21  0.12  0.01   0.48   0.46
## dbaes_btotal    9 178   0.48  0.12   0.48    0.48  0.14  0.25   0.79   0.54
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           0.86     1.27 0.31
## ados_age      0.48    -0.88 7.70
## ados_sa_css  -0.26    -1.21 0.49
## ados_rrb_css -1.12     1.22 0.43
## iq           -0.55     0.78 2.30
## dbaes_atotal  0.13    -0.79 0.01
## dbaes_btotal  0.22    -0.64 0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 538   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 538    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 538   9.06  5.47   8.08    8.30  4.69   0  45.75  45.75  1.99
## ados_age        4  85  82.36 45.77  71.00   77.48 47.44  27 202.00 175.00  0.73
## ados_sa_css     5  85   6.81  2.06   7.00    6.90  1.48   1  10.00   9.00 -0.34
## ados_rrb_css    6  85   7.67  2.30   8.00    8.06  1.48   1  10.00   9.00 -1.59
## iq              7 131 104.88 18.15 107.00  106.22 17.79  42 138.00  96.00 -0.84
## dbaes_atotal    8 538   0.30  0.14   0.30    0.30  0.14   0   0.70   0.70  0.03
## dbaes_btotal    9 538   0.31  0.14   0.31    0.32  0.14   0   0.68   0.68 -0.07
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              6.78 0.24
## ados_age        -0.67 4.96
## ados_sa_css     -0.26 0.22
## ados_rrb_css     2.36 0.25
## iq               1.08 1.59
## dbaes_atotal    -0.28 0.01
## dbaes_btotal    -0.31 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 173   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 173    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 173   7.31  5.33   5.75    6.32  2.97  1.67  37.33  35.67
## ados_age        4  43  67.98 32.79  65.00   62.74 22.24 30.00 172.00 142.00
## ados_sa_css     5  43   7.49  1.52   7.00    7.51  1.48  4.00  10.00   6.00
## ados_rrb_css    6  43   8.21  1.73   8.00    8.40  1.48  1.00  10.00   9.00
## iq              7  21 103.29 21.68 111.00  104.71 19.27 40.00 140.00 100.00
## dbaes_atotal    8 173   0.49  0.13   0.48    0.49  0.12  0.20   0.87   0.67
## dbaes_btotal    9 173   0.22  0.10   0.23    0.22  0.10  0.00   0.47   0.47
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.45     7.70 0.40
## ados_age      1.57     2.41 5.00
## ados_sa_css  -0.08    -0.68 0.23
## ados_rrb_css -1.67     4.79 0.26
## iq           -0.91     1.18 4.73
## dbaes_atotal  0.31    -0.14 0.01
## dbaes_btotal  0.12    -0.30 0.01
# Replication: keep identifiers plus the variables to summarise
df2use <- Dverbal_Replication[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# age = interview_age / 12 (presumably months converted to years)
df2use$age <- df2use$interview_age / 12

# summarise the selected columns separately for each level of z_ds_group
cols2use <- c(
  "z_ds_group", "sex", "age", "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)
res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 177   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 177    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 177   9.92  4.68   9.33    9.46  3.83  3.00  28.58  25.58
## ados_age        4  14  92.43 52.75  79.00   89.08 58.56 36.00 189.00 153.00
## ados_sa_css     5  14   6.64  2.21   7.00    6.67  2.22  3.00  10.00   7.00
## ados_rrb_css    6  14   6.93  1.44   7.00    6.83  0.74  5.00  10.00   5.00
## iq              7  55 102.98 18.88 104.00  103.76 16.31 57.00 152.00  95.00
## dbaes_atotal    8 177   0.22  0.11   0.21    0.21  0.10  0.01   0.61   0.60
## dbaes_btotal    9 177   0.50  0.13   0.49    0.49  0.11  0.21   0.93   0.72
##               skew kurtosis    se
## z_ds_group*    NaN      NaN  0.00
## sex*            NA       NA    NA
## age           1.16     2.07  0.35
## ados_age      0.38    -1.40 14.10
## ados_sa_css  -0.34    -1.18  0.59
## ados_rrb_css  0.40    -0.49  0.38
## iq           -0.28     0.65  2.55
## dbaes_atotal  0.80     1.21  0.01
## dbaes_btotal  0.45     0.64  0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 552   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 552    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 552   8.84  5.22   7.88    8.09  4.45   0  33.83  33.83  1.64
## ados_age        4 107  80.40 38.24  74.00   76.39 43.00  35 196.00 161.00  0.82
## ados_sa_css     5 107   6.84  2.03   7.00    6.87  1.48   2  10.00   8.00 -0.02
## ados_rrb_css    6 107   7.37  2.37   8.00    7.71  1.48   1  10.00   9.00 -1.31
## iq              7 108 107.71 15.98 108.50  107.65 14.08  64 146.00  82.00 -0.02
## dbaes_atotal    8 552   0.31  0.14   0.31    0.31  0.13   0   0.78   0.78  0.04
## dbaes_btotal    9 552   0.33  0.14   0.32    0.33  0.14   0   0.81   0.81  0.05
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              3.65 0.22
## ados_age         0.02 3.70
## ados_sa_css     -0.79 0.20
## ados_rrb_css     1.31 0.23
## iq              -0.13 1.54
## dbaes_atotal    -0.07 0.01
## dbaes_btotal     0.05 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 161   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 161    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 161   7.95  6.37   6.00    6.65  3.34  2.08  40.92  38.83
## ados_age        4  32  67.81 25.08  62.50   64.73 21.50 30.00 141.00 111.00
## ados_sa_css     5  32   7.31  1.79   7.00    7.35  1.48  3.00  10.00   7.00
## ados_rrb_css    6  32   7.53  2.36   8.00    7.85  1.48  1.00  10.00   9.00
## iq              7  23 108.91 20.06 110.00  109.53 20.76 62.00 146.00  84.00
## dbaes_atotal    8 161   0.50  0.13   0.50    0.50  0.14  0.23   0.96   0.73
## dbaes_btotal    9 161   0.22  0.11   0.22    0.22  0.12  0.00   0.50   0.50
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.32     6.20 0.50
## ados_age      1.14     1.16 4.43
## ados_sa_css  -0.16    -0.80 0.32
## ados_rrb_css -1.21     1.16 0.42
## iq           -0.35    -0.57 4.18
## dbaes_atotal  0.30     0.18 0.01
## dbaes_btotal  0.11    -0.65 0.01
#------------------------------------------------------------------------------
# Does interview age differ between the subtypes?

# Discovery: one-way ANOVA of interview_age on z_ds_group
mod2use <- lm(interview_age ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2   98234   49117  12.636 3.881e-06 ***
## Residuals  886 3443832    3887                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: one-way ANOVA of interview_age on z_ds_group
mod2use <- lm(interview_age ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value   Pr(>F)   
## z_ds_group   2   47475 23737.6  5.7593 0.003273 **
## Residuals  887 3655889  4121.6                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Does SC (dbaes_atotal) differ between the subtypes?

# Discovery: one-way ANOVA of dbaes_atotal on z_ds_group
mod2use <- lm(dbaes_atotal ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.3441  3.6721  220.92 < 2.2e-16 ***
## Residuals  886 14.7266  0.0166                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: one-way ANOVA of dbaes_atotal on z_ds_group
mod2use <- lm(dbaes_atotal ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.1446  3.5723  206.14 < 2.2e-16 ***
## Residuals  887 15.3708  0.0173                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Does RRB (dbaes_btotal) differ between the subtypes?

# Discovery: one-way ANOVA of dbaes_btotal on z_ds_group
mod2use <- lm(dbaes_btotal ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.3201 3.16005  199.92 < 2.2e-16 ***
## Residuals  886 14.0047 0.01581                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication
# One-way ANOVA: dbaes_btotal (RRB) as a function of z_ds_group (Replication set)
mod2use = lm(dbaes_btotal ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.7058  3.3529  190.36 < 2.2e-16 ***
## Residuals  887 15.6228  0.0176                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in ADOS SA CSS across the subtypes

# Discovery
# One-way ANOVA: ados_sa_css as a function of z_ds_group (Discovery set)
mod2use = lm(ados_sa_css ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## z_ds_group   2  20.58 10.2891  2.5813 0.07903 .
## Residuals  150 597.89  3.9859                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication
# One-way ANOVA: ados_sa_css as a function of z_ds_group (Replication set)
mod2use = lm(ados_sa_css ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   6.67  3.3352  0.8333 0.4366
## Residuals  150 600.39  4.0026
#------------------------------------------------------------------------------
# Tests of differences in ADOS RRB CSS across the subtypes

# Discovery
# One-way ANOVA: ados_rrb_css as a function of z_ds_group (Discovery set)
mod2use = lm(ados_rrb_css ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2  10.59  5.2931  1.1708 0.3129
## Residuals  150 678.13  4.5209
# Replication
# One-way ANOVA: ados_rrb_css as a function of z_ds_group (Replication set)
mod2use = lm(ados_rrb_css ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   3.56  1.7796  0.3362  0.715
## Residuals  150 793.94  5.2930
#------------------------------------------------------------------------------
# Tests of differences in IQ across the subtypes

# Discovery
# One-way ANOVA: iq as a function of z_ds_group (Discovery set)
mod2use = lm(iq ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2    397  198.34  0.6109 0.5439
## Residuals  196  63632  324.65
# Replication
# One-way ANOVA: iq as a function of z_ds_group (Replication set)
mod2use = lm(iq ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2    973  486.66  1.6066 0.2034
## Residuals  183  55433  302.91
#------------------------------------------------------------------------------
# Make scatterplots with difference score Z subtypes in different colors
# NOTE(review): maxScores is not read by the plotting code in this section; it
# is left in place in case another part of the file uses it.
maxScores = c(3,4)

# Scatterplot of SC (x) against RRB (y) in the Discovery set, one point per
# row, coloured by difference-score subtype.
p_disc = ggplot(data = Dverbal_Discovery,
                aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)) +
  geom_point() +
  xlab("SC") + ylab("RRB") +
  ylim(0,1) + xlim(0,1) +
  ggtitle("NDAR Discovery")

# The saved panel carries no legend. guides(colour = FALSE) is deprecated in
# recent ggplot2 releases; "none" is the supported way to drop a guide.
p1_top_left = p_disc + guides(colour = "none")
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p1_top_left)
p_disc

# number of rows in each subtype
table(Dverbal_Discovery$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          178          538          173
# Correlation test between the SC and RRB scores in the Discovery set
cor_res = cor.test(x = Dverbal_Discovery$dbaes_atotal,
                   y = Dverbal_Discovery$dbaes_btotal)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1554332 0.2806578
## sample estimates:
##       cor 
## 0.2189469
# Scatterplot of SC (x) against RRB (y) in the Replication set, one point per
# row, coloured by difference-score subtype.
p_rep = ggplot(data = Dverbal_Replication,
               aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)) +
  geom_point() +
  xlab("SC") + ylab("RRB") +
  ylim(0,1) + xlim(0,1) +
  ggtitle("NDAR Replication")

# The saved panel carries no legend. guides(colour = FALSE) is deprecated in
# recent ggplot2 releases; "none" is the supported way to drop a guide.
p2_bottom_left = p_rep + guides(colour = "none")
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p2_bottom_left)
p_rep

# number of rows in each subtype
table(Dverbal_Replication$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          177          552          161
# Correlation test between the SC and RRB scores in the Replication set
cor_res = cor.test(x = Dverbal_Replication$dbaes_atotal,
                   y = Dverbal_Replication$dbaes_btotal)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1700824 0.2943934
## sample estimates:
##       cor 
## 0.2331903
#------------------------------------------------------------------------------
# Run supervised model with Discovery as training and Replication as Test

# run validation
# make subtypes using z-scores computed from the mean and sd of the training set
# Discovery acts as the training set and Replication as the test set.
train_data = Dverbal_Discovery
test_data = Dverbal_Replication

# Subtype each set against its own norms: the z-score of the SC minus RRB
# difference uses the mean and sd of that same set.
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
tmp_train = make_subtype(data2use = train_data,
                         z_thresh = z_thresh,
                         mean2use = mean2use,
                         sd2use = sd2use)

mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
tmp_test = make_subtype(data2use = test_data,
                        z_thresh = z_thresh,
                        mean2use = mean2use,
                        sd2use = sd2use)

#==============================================================================
#==============================================================================
# make labels based on train mean and sd
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
train_mean = mean2use
train_sd = sd2use

# "Predicted" labels: the test set subtyped with the training-set norms.
pred_labels = make_subtype(data2use = test_data,
                        z_thresh = z_thresh,
                        mean2use = train_mean,
                        sd2use = train_sd)

# Cross-tabulate test-norm labels (rows) against train-norm labels (columns).
# make_subtype() drops factor levels that have no members, so the levels are
# pinned here; otherwise an empty subtype (e.g. with a large z_thresh) would
# give a table that is not 3x3 and the accuracy below would fail or misalign.
subtype_levels = c("RRB_over_SC","SC_equal_RRB","SC_over_RRB")
confmat = table(factor(tmp_test$z_ds_group, levels = subtype_levels),
                factor(pred_labels$z_ds_group, levels = subtype_levels))

# accuracy = proportion of rows on the diagonal of the confusion matrix
acc = sum(diag(confmat))/nrow(pred_labels)

# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#==============================================================================
#==============================================================================

# plot confusion matrix
# Draw the confusion matrix as a heatmap of row percentages, with the raw
# counts printed in the cells.
# The hook shrinks the drawing area to the top-right 90% of the page, leaving
# room on the left and at the bottom for the axis captions added afterwards.
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
pheatmap(confmat/rowSums(confmat)*100,
         display_numbers = confmat,
         color = colorRampPalette(c('white','red'))(100),
         cluster_rows = FALSE, cluster_cols = FALSE,
         fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,
         labels_row = c("RRB>SC","SC=RRB","SC>RRB"),
         labels_col = c("RRB>SC","SC=RRB","SC>RRB"),
         angle_col=90,
         # pheatmap expects one more break than colours (100 colours -> 101
         # breaks); length.out is spelled out to avoid partial matching
         breaks = seq(0, 100, length.out = 101))
setHook("grid.newpage", NULL, "replace")
# confmat is table(tmp_test labels, pred_labels): its columns (drawn along the
# bottom) are the predicted labels and its rows are the test set's own labels,
# so the bottom caption is "Predicted" and the side caption is "Actual".
grid::grid.text("Predicted Labels", y=-0.07, gp=gpar(fontsize=fontSize))
grid::grid.text("Actual Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))

# show accuracy
true_accuracy = acc
true_accuracy
## [1] 0.9786517
#================================================================================
#================================================================================
# #------------------------------------------------------------------------------
# # Permute subtype labels to examine how well supervised model performs
#
# # nperm = 10000
#
# # make subtypes using z-scores computed from the mean and sd of the training set
# train_data = Dverbal_Discovery
# test_data = Dverbal_Replication
# mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# tmp_train = make_subtype(data2use = train_data,
#                          z_thresh = z_thresh,
#                          mean2use = mean2use,
#                          sd2use = sd2use)
#
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# tmp_test = make_subtype(data2use = test_data,
#                         z_thresh = z_thresh,
#                         mean2use = mean2use,
#                         sd2use = sd2use)
#
# acc = vector(length = nperm)
# for (iperm in 1:nperm){
#   # set seed for reproducibility
#   set.seed(iperm)
#
#   sc_perm = sample(train_data[,vars2use[1]])
#   rrb_perm = sample(train_data[,vars2use[2]])
#   perm_mean2use = mean(sc_perm - rrb_perm)
#   perm_sd2use = sd(sc_perm - rrb_perm)
#   # perm_mean2use = mean(train_data[,vars2use[1]] - rrb_perm)
#   # perm_sd2use = sd(train_data[,vars2use[1]] - rrb_perm)
#   pred_labels = make_subtype(data2use = tmp_test,
#                         z_thresh = z_thresh,
#                         mean2use = perm_mean2use,
#                         sd2use = perm_sd2use)
#   confmat = table(tmp_test$z_ds_group,pred_labels$z_ds_group)
#   acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/dim(pred_labels)[1]
#
#   # compute model
#   permuted_labels = sample(tmp_train$z_ds_group)
#   mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
#   pred_labels = predict(mod2use, tmp_test[,vars2use])
#   confmat = table(tmp_test$z_ds_group,pred_labels)
#   acc[iperm] = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#   #============================================================================
# } # for (iperm in 1:nperm)
#
# df2plot = data.frame(Accuracy = acc)
# p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
# p
#
# # compute p-value
# pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
# pval
#================================================================================
#================================================================================

#------------------------------------------------------------------------------
# Plot difference score Z subtypes in Discovery set
# NOTE(review): maxScores is not read by the plotting code in this section; it
# is left in place in case another part of the file uses it.
maxScores = c(3,4)

# Build the per-individual SC/RRB line plot for one dataset.
#   data2use      - data frame with a subjectkey column plus the columns named
#                   in cols2keep (which must include dbaes_atotal and
#                   dbaes_btotal)
#   subgrp_labels - subtype label for each row of data2use, in row order
#   cols2keep     - names of the score columns to carry along
# Returns a list with the wide data (df2use), the melted long data (df4plot)
# and the ggplot object (p), which has the colour legend suppressed.
make_subtype_lineplot <- function(data2use, subgrp_labels, cols2keep){
  df2use = data.frame(data2use[,c("subjectkey",cols2keep)])
  df2use$subgrp = factor(subgrp_labels)
  df2use$subjectkey = factor(df2use$subjectkey)
  df2use$SC = df2use$dbaes_atotal
  df2use$RRB = df2use$dbaes_btotal

  # long format: one row per individual per subscale
  df4plot = melt(df2use,
                 id.vars = c("subjectkey","subgrp"),
                 measure.vars = c("SC","RRB"))

  # one line per individual joining their SC and RRB values, one facet per subtype
  p = ggplot(data = df4plot, aes(x = variable,
                                 y = value,
                                 colour = subgrp,
                                 group = subjectkey)) + facet_grid(. ~ subgrp)
  # guides(colour = FALSE) is deprecated in recent ggplot2; "none" drops the legend
  p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + ylim(0,1) + guides(colour = "none")
  p = p + ylab("Percent Severity") + xlab("ADI-R subscale")

  return(list(df2use = df2use, df4plot = df4plot, p = p))
} # function make_subtype_lineplot

# Discovery - make plot with all individuals shown as lines
lineplot_disc = make_subtype_lineplot(data2use = Dverbal_Discovery,
                                      subgrp_labels = tmp_train$z_ds_group,
                                      cols2keep = adi_total_vars2use)
df2use = lineplot_disc$df2use
df4plot = lineplot_disc$df4plot
p = lineplot_disc$p
p3_middle_top = p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p3_middle_top)
p

# Plot difference score Z subtypes in Replication set
maxScores = c(3,4)

# Replication - make plot with all individuals shown as lines
lineplot_rep = make_subtype_lineplot(data2use = Dverbal_Replication,
                                     subgrp_labels = tmp_test$z_ds_group,
                                     cols2keep = adi_total_vars2use)
df2use = lineplot_rep$df2use
df4plot = lineplot_rep$df4plot
p = lineplot_rep$p
p4_middle_bottom = p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p4_middle_bottom)
p

#------------------------------------------------------------------------------
# Apply NDAR subtypes to EU-AIMS LEAP data

# make SC and RRB percentages since EU-AIMS data is specified as percentages
Dverbal = read.csv(file.path(datapath,"tidy_verbal.csv"))
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))

# keep ASD rows that have a value in every A1-A3 and B1-B4 percent-severity column
a_pct_cols = c("A1_pct_severity","A2_pct_severity","A3_pct_severity")
b_pct_cols = c("B1_pct_severity","B2_pct_severity","B3_pct_severity","B4_pct_severity")
mask1 = euaims_data$Diagnosis=="ASD"
mask2 = !complete.cases(euaims_data[,c(a_pct_cols,b_pct_cols)])
euaims_data = subset(euaims_data, (mask1 & !mask2))

# The NDAR columns in Dverbal are used as they are (the former self-assignments
# of Dverbal[,vars2use] were no-ops and have been removed). For EU-AIMS the SC
# and RRB scores are the means of the A1-A3 and B1-B4 percent-severity columns.
euaims_data[,vars2use[1]] = (euaims_data$A1_pct_severity +
                               euaims_data$A2_pct_severity +
                               euaims_data$A3_pct_severity)/3
euaims_data[,vars2use[2]] = (euaims_data$B1_pct_severity +
                               euaims_data$B2_pct_severity +
                               euaims_data$B3_pct_severity +
                               euaims_data$B4_pct_severity)/4

train_data = Dverbal
test_data = euaims_data

# norms (mean and sd of the SC minus RRB difference) from the training data,
# ignoring rows with missing scores
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
c(mean2use, sd2use)
## [1] -0.01045243  0.19482749
# Subtype both the training data and the EU-AIMS test data against the same
# training-derived mean and sd.
tmp_train = make_subtype(train_data, z_thresh, mean2use = mean2use, sd2use = sd2use)

tmp_test = make_subtype(test_data, z_thresh, mean2use = mean2use, sd2use = sd2use)

#===========================================================================
#===========================================================================
# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#
# tmp_test$svm_pred_labels = pred_labels
#
# # plot confusion matrix
# setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
#          breaks= seq(0,100, length=100))
# setHook("grid.newpage", NULL, "replace")
# grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=fontSize))
# grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))
#===========================================================================
#===========================================================================

# scatterplot
# SC against RRB for the training data, coloured by subtype.
# Axis limits are 0 to 1, matching the other scatterplots in this file. The
# previous 0 to 0.8 limits dropped points above 0.8, and the descriptive
# statistics in this file report A_pct_severity values up to 0.82.
p1 = ggplot(data = tmp_train,
            aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))) +
  geom_point() +
  xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") +
  ylim(0,1) + xlim(0,1) +
  ggtitle("NDAR ALL")
p1

# Same plot for the EU-AIMS test data, using the subtype labels derived from
# the training-data norms.
p2 = ggplot(data = tmp_test,
            aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))) +
  geom_point() +
  xlab("Social-Communication") + ylab("Restricted Repetitive Behaviors") +
  ylim(0,1) + xlim(0,1) +
  ggtitle("EU-AIMS with Groups from NDAR ALL")
p2

#===========================================================================
#===========================================================================
# p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS")
# p5_bottom_right = p3 + guides(colour=FALSE)
# ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
# p3
#===========================================================================
#===========================================================================

# write out EU-AIMS LEAP data with subgroups defined by NDAR All
euaims_subtype_fname = sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv",as.character(z_thresh))
write.csv(tmp_test, file.path(datapath, euaims_subtype_fname))

#===========================================================================
#===========================================================================
# # Make final plot
# p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
# ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
# p_final
#===========================================================================
#===========================================================================

#------------------------------------------------------------------------------
# Integrate subtypes with rest of EU-AIMS LEAP data
# re-read the full EU-AIMS table (the copy above was filtered to ASD rows)
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))

# TD rows: columns 2 to 6 of the table, plus empty severity columns and a
# "TD" subgroup so the layout lines up with the ASD table built below.
# NOTE(review): assumes columns 2:6 are subid, age, Centre, Schedule and
# Diagnosis, as the rbind with asd_df needs matching names - confirm.
td_df = subset(euaims_data, euaims_data$Diagnosis=="TD",select=2:6)
td_df$A_pct_severity = NA_real_
td_df$B_pct_severity = NA_real_
td_df$subgrp = "TD"

#===========================================================================
#===========================================================================
# cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","z_ds_group")
#===========================================================================
#===========================================================================
# ASD rows: take the subtyped EU-AIMS data and rename the score and subtype
# columns (positions 6 to 8 of cols2use) to the names used in td_df
tmp_asd = tmp_test[,cols2use]
colnames(tmp_asd)[6:8] = c("A_pct_severity","B_pct_severity","subgrp")
asd_df = tmp_asd

all_data = rbind(td_df,asd_df)

# preprocessing sheet; only rows whose notes column is "ok" are kept
# NOTE(review): absolute, machine-specific path - the script will not run
# elsewhere unless this file exists at the same location.
fname = "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx"
pp_data = read_excel(fname)
mask = pp_data$notes=="ok"
pp_data = subset(pp_data, mask)

# attach sex from the preprocessing sheet; merge() also restricts both tables
# to subids present in pp_data
sex_by_subid = pp_data[,c("subid","sex")]
asd_df = merge(sex_by_subid, asd_df, by = "subid")
td_df = merge(sex_by_subid, td_df, by = "subid")

all_data = rbind(td_df,asd_df)

# join the preprocessing columns back on; age and sex exist in both tables, so
# merge() suffixes them and the versions to keep are picked explicitly
data2write = merge(pp_data, all_data, by = "subid")
data2write$age = data2write$age.x
data2write$sex = data2write$sex.y
print(table(data2write[["Schedule"]], data2write[["subgrp"]]))
##    
##     RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   A           6           39          35 78
##   B           1           47          37 83
##   C           4           33          26 59
##   D           0           13          25 23
# counts of each subgroup per Centre
print(table(data2write[["Centre"]], data2write[["subgrp"]]))
##                
##                 RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   CAMBRIDGE               4           29           9 29
##   KINGS_COLLEGE           6           51          49 78
##   MANNHEIM                0            0           0 34
##   NIJMEGEN                1           41          51 64
##   UTRECHT                 0           11          14 38
# counts of each subgroup per sex
print(table(data2write[["sex"]], data2write[["subgrp"]]))
##         
##          RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Female           5           33          33  88
##   Male             6           99          90 155
# DSM-5 - find best split that balances participants across sites
# (only one seed is evaluated here; the disabled remainder of the range was ",300001:500000")
a = findBestSplit(asd_df, seed_range = 172342)
## [1] 172342
# seeds whose discrepancy equals the smallest non-missing discrepancy
min_discrepancy = min(a$discrepancy, na.rm = TRUE)
best_seeds = a$seed[which(a$discrepancy == min_discrepancy)]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
rngSeed = best_seeds[1]

# Split the ASD rows of each Schedule (A to D) separately with the same seed.
# Element 2 of the SplitDatasetsBySex() result is used as Discovery and
# element 1 as Replication.
split_by_schedule = list()
for (schedule_label in c("A","B","C","D")){
  dset2use = subset(asd_df, asd_df$Schedule==schedule_label)
  tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
  split_by_schedule[[schedule_label]] = tmp_d
} # for (schedule_label in c("A","B","C","D"))

A_Discovery = split_by_schedule[["A"]][[2]]
A_Replication = split_by_schedule[["A"]][[1]]
B_Discovery = split_by_schedule[["B"]][[2]]
B_Replication = split_by_schedule[["B"]][[1]]
C_Discovery = split_by_schedule[["C"]][[2]]
C_Replication = split_by_schedule[["C"]][[1]]
D_Discovery = split_by_schedule[["D"]][[2]]
D_Replication = split_by_schedule[["D"]][[1]]

# stack the per-schedule halves
df_Disc = rbind(A_Discovery, B_Discovery, C_Discovery, D_Discovery)
df_Rep = rbind(A_Replication, B_Replication, C_Replication, D_Replication)

# Schedule by Centre counts in the Discovery half
a = table(df_Disc$Schedule, df_Disc$Centre)
print(a)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         6            17       10       7
##   B         9            16       14       3
##   C         6             9       14       2
##   D         0            10       10       0
# Schedule by Centre counts in the Replication half
b = table(df_Rep$Schedule, df_Rep$Centre)
print(b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         7            18       10       5
##   B         8            17       13       5
##   C         6            10       13       3
##   D         0             9        9       0
# cell-wise difference in counts, Discovery minus Replication
print(a - b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A        -1            -1        0       2
##   B         1            -1        1      -2
##   C         0            -1        1      -1
##   D         0             1        1       0
# total absolute difference in counts over all Schedule by Centre cells
print(sum(abs(a - b)))
## [1] 14
# Label each row by the half its subid was assigned to; rows in neither half
# keep NA.
data2write$dataset = NA
mask = data2write$subid %in% df_Disc$subid
data2write$dataset[mask] = "Discovery"
mask = data2write$subid %in% df_Rep$subid
data2write$dataset[mask] = "Replication"

# ASD rows of each half
asd_Disc = subset(data2write, dataset=="Discovery" & Diagnosis=="ASD")
asd_Rep = subset(data2write, dataset=="Replication" & Diagnosis=="ASD")

# # find which seed gives best TD age-match -------------------------------------
# seeds = 1:1000
# pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
# for (i in 1:length(seeds)) {
#   res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
#   td_Disc_matched = res[[2]]
#   td_Rep_matched = res[[1]]
#   tres = t.test(td_Disc_matched$age, asd_Disc$age)
#   pvals[i,1] = tres$p.value
#   tres = t.test(td_Rep_matched$age, asd_Rep$age)
#   pvals[i,2] = tres$p.value
#   #print(i)
# }
# a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
# b=pvals[a$ix,]

# Run the TD matching with a single fixed seed. Element 2 of the result is
# used as the Discovery TD set and element 1 as the Replication TD set.
seed2use = 929
res = findTDAgeMatch(data2write, seed_range = rep(seed2use, 2))
td_Disc_matched = res[[2]]
td_Rep_matched = res[[1]]

# label the matched TD rows with their half
mask = data2write$subid %in% td_Disc_matched$subid
data2write$dataset[mask] = "Discovery"

mask = data2write$subid %in% td_Rep_matched$subid
data2write$dataset[mask] = "Replication"

# subgroup counts per half
print(table(data2write[["dataset"]], data2write[["subgrp"]]))
##              
##               RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Discovery             8           65          60 121
##   Replication           3           67          63 122
# write the combined table, with the z threshold encoded in the file name
csv_name = sprintf("asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv",as.character(z_thresh))
fname2save = here(csv_name)
write.csv(data2write, file = fname2save)


#------------------------------------------------------------------------------
# Descriptive stats
# Descriptive statistics of the selected columns, per subgroup and dataset half.
# keep only the identifier, grouping and measure columns needed for the summary
df2use = data2write[,c("subid","Diagnosis","dataset","Centre","meanFD","age","sex","subgrp","viq_all","piq_all","fsiq4_all","A_pct_severity","B_pct_severity","ADI_social_total","ADI_communication_total","ADI_RRB_total","ados_2_SA_CSS","ados_2_RRB_CSS","SRS_tscore","SRS_tscore_self","RBS_total","SSP_total","vabsdscoresc_dss","vabsdscoresd_dss","vabsdscoress_dss","vabsabcabc_standard")]
# blank out every cell equal to 999 or 777 (presumably missing-data codes - confirm);
# the comparison runs over all columns of df2use, not only the numeric ones
mask = df2use==999 | df2use==777
df2use[mask] = NA
# NOTE(review): dividing by 365 suggests age is stored in days and converted to years - confirm
df2use$age = df2use$age/365

# columns passed to describeBy(); the grouping columns themselves are included,
# so they also appear as rows in the summary table
cols2use = c("dataset","subgrp","age","meanFD","viq_all","piq_all","fsiq4_all","A_pct_severity","B_pct_severity","ADI_social_total","ADI_communication_total","ADI_RRB_total","ados_2_SA_CSS","ados_2_RRB_CSS","SRS_tscore","SRS_tscore_self","RBS_total","SSP_total","vabsdscoresc_dss","vabsdscoresd_dss","vabsdscoress_dss","vabsabcabc_standard")
# grouping variables are given by column name: one summary per subgrp x dataset combination
res=describeBy(x = df2use[,cols2use], group = c("subgrp","dataset"))
res
## 
##  Descriptive statistics by group 
## subgrp: RRB_over_SC
## dataset: Discovery
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 8    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 8    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 8  17.53  7.05  21.73   17.53  2.56   7.89  23.88
## meanFD                     4 8   0.27  0.36   0.18    0.27  0.07   0.06   1.14
## viq_all                    5 8 103.86 13.94 107.50  103.86 12.60  78.00 123.85
## piq_all                    6 8 101.37  9.62  99.50  101.37  9.64  87.00 114.00
## fsiq4_all                  7 8 102.75 12.37 103.00  102.75 10.38  80.00 118.01
## A_pct_severity             8 8   0.20  0.11   0.19    0.20  0.13   0.05   0.35
## B_pct_severity             9 8   0.47  0.08   0.47    0.47  0.09   0.36   0.59
## ADI_social_total          10 8  17.50  7.80  20.50   17.50  7.41   5.00  26.00
## ADI_communication_total   11 8  16.25  6.30  17.00   16.25  7.41   6.00  24.00
## ADI_RRB_total             12 8   8.50  1.41   8.50    8.50  2.22   7.00  10.00
## ados_2_SA_CSS             13 8   3.75  2.87   3.00    3.75  2.22   1.00   9.00
## ados_2_RRB_CSS            14 8   3.88  3.98   1.00    3.88  0.00   1.00   9.00
## SRS_tscore                15 4  72.25 13.72  70.50   72.25 12.60  58.00  90.00
## SRS_tscore_self           16 4  59.00  7.12  61.00   59.00  4.45  49.00  65.00
## RBS_total                 17 4  18.00  8.83  17.50   18.00  8.90   8.00  29.00
## SSP_total                 18 3 142.00 24.06 140.00  142.00 31.13 119.00 167.00
## vabsdscoresc_dss          19 5  84.20 18.63  77.00   84.20 14.83  67.00 115.00
## vabsdscoresd_dss          20 5  69.00  9.46  74.00   69.00  7.41  57.00  79.00
## vabsdscoress_dss          21 5  70.40  8.99  71.00   70.40  2.97  57.00  82.00
## vabsabcabc_standard       22 5  72.80 10.76  72.00   72.80  7.41  59.00  88.00
##                         range  skew kurtosis    se
## dataset*                 -Inf    NA       NA    NA
## subgrp*                  -Inf    NA       NA    NA
## age                     16.00 -0.41    -1.98  2.49
## meanFD                   1.09  1.77     1.50  0.13
## viq_all                 45.85 -0.43    -0.93  4.93
## piq_all                 27.00  0.09    -1.56  3.40
## fsiq4_all               38.01 -0.40    -1.08  4.37
## A_pct_severity           0.30  0.01    -1.94  0.04
## B_pct_severity           0.23  0.05    -1.67  0.03
## ADI_social_total        21.00 -0.41    -1.67  2.76
## ADI_communication_total 18.00 -0.26    -1.52  2.23
## ADI_RRB_total            3.00  0.00    -2.05  0.50
## ados_2_SA_CSS            8.00  0.70    -1.16  1.01
## ados_2_RRB_CSS           8.00  0.44    -2.00  1.41
## SRS_tscore              32.00  0.24    -2.00  6.86
## SRS_tscore_self         16.00 -0.50    -1.88  3.56
## RBS_total               21.00  0.11    -1.98  4.42
## SSP_total               48.00  0.08    -2.33 13.89
## vabsdscoresc_dss        48.00  0.71    -1.35  8.33
## vabsdscoresd_dss        22.00 -0.23    -2.10  4.23
## vabsdscoress_dss        25.00 -0.23    -1.46  4.02
## vabsabcabc_standard     29.00  0.14    -1.65  4.81
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 65    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 65    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 65  16.05  5.71  14.85   15.76  4.76  7.08  30.28
## meanFD                     4 65   0.30  0.49   0.19    0.22  0.13  0.04   3.95
## viq_all                    5 64  96.39 18.31  96.17   96.58 22.24 61.00 136.00
## piq_all                    6 64  99.34 20.61 102.00   99.62 19.94 58.00 145.00
## fsiq4_all                  7 65  98.03 18.10 102.00   98.61 19.27 60.00 131.00
## A_pct_severity             8 65   0.31  0.14   0.31    0.31  0.15  0.00   0.63
## B_pct_severity             9 65   0.30  0.15   0.29    0.30  0.16  0.01   0.69
## ADI_social_total          10 65  17.00  6.94  18.00   17.36  7.41  2.00  27.00
## ADI_communication_total   11 65  13.66  6.03  14.00   13.70  7.41  0.00  26.00
## ADI_RRB_total             12 65   5.31  2.57   5.00    5.34  2.97  0.00  12.00
## ados_2_SA_CSS             13 64   6.44  2.51   7.00    6.54  2.97  1.00  10.00
## ados_2_RRB_CSS            14 64   5.09  2.79   5.50    5.10  2.22  1.00  10.00
## SRS_tscore                15 57  71.93 12.19  74.00   72.32 13.34 45.00  90.00
## SRS_tscore_self           16 31  61.26 11.36  61.00   60.92 14.83 43.00  89.00
## RBS_total                 17 55  18.24 14.19  17.00   16.51 13.34  0.00  60.00
## SSP_total                 18 36 134.92 29.75 136.50  134.90 33.36 81.00 189.00
## vabsdscoresc_dss          19 63  72.92 18.48  75.00   73.57 13.34 21.00 122.00
## vabsdscoresd_dss          20 62  72.68 16.92  71.00   72.48 11.86 25.00 131.00
## vabsdscoress_dss          21 63  71.03 15.63  73.00   72.14 13.34 20.00 104.00
## vabsabcabc_standard       22 62  70.87 13.57  72.00   71.20 10.38 20.00 103.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.20  0.51    -0.51 0.71
## meanFD                    3.91  6.50    45.73 0.06
## viq_all                  75.00 -0.12    -0.84 2.29
## piq_all                  87.00 -0.16    -0.61 2.58
## fsiq4_all                71.00 -0.29    -0.84 2.24
## A_pct_severity            0.63  0.02    -0.64 0.02
## B_pct_severity            0.68  0.40    -0.33 0.02
## ADI_social_total         25.00 -0.43    -0.93 0.86
## ADI_communication_total  26.00 -0.07    -0.81 0.75
## ADI_RRB_total            12.00 -0.02    -0.38 0.32
## ados_2_SA_CSS             9.00 -0.40    -0.97 0.31
## ados_2_RRB_CSS            9.00 -0.33    -1.05 0.35
## SRS_tscore               45.00 -0.25    -0.94 1.62
## SRS_tscore_self          46.00  0.23    -0.80 2.04
## RBS_total                60.00  1.04     0.66 1.91
## SSP_total               108.00 -0.01    -1.05 4.96
## vabsdscoresc_dss        101.00 -0.36     0.84 2.33
## vabsdscoresd_dss        106.00  0.23     1.98 2.15
## vabsdscoress_dss         84.00 -0.77     0.94 1.97
## vabsabcabc_standard      83.00 -0.59     2.44 1.72
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 60    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 60    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 60  16.54  5.11  16.03   16.34  5.10  7.78  29.40
## meanFD                     4 60   0.23  0.23   0.15    0.17  0.10  0.03   1.08
## viq_all                    5 59  98.53 19.34 100.00   98.07 19.27 64.55 142.00
## piq_all                    6 59  99.61 22.46 103.00  100.53 20.76 52.43 150.00
## fsiq4_all                  7 60  99.23 19.52 101.97   99.71 19.60 59.00 143.00
## A_pct_severity             8 60   0.44  0.14   0.44    0.43  0.13  0.19   0.82
## B_pct_severity             9 60   0.16  0.10   0.15    0.16  0.11  0.00   0.40
## ADI_social_total          10 60  17.87  6.48  18.00   18.21  8.90  3.00  28.00
## ADI_communication_total   11 60  14.58  4.88  15.00   14.75  5.19  2.00  24.00
## ADI_RRB_total             12 60   3.00  1.95   3.00    2.88  1.48  0.00   8.00
## ados_2_SA_CSS             13 58   6.16  2.62   7.00    6.27  2.97  1.00  10.00
## ados_2_RRB_CSS            14 58   4.60  2.78   5.00    4.50  2.97  1.00  10.00
## SRS_tscore                15 53  73.09 11.53  74.00   73.58 10.38 44.00  95.00
## SRS_tscore_self           16 25  64.24 12.23  63.00   63.24  8.90 42.00  94.00
## RBS_total                 17 51  15.98 16.59  12.00   13.41 13.34  0.00  90.00
## SSP_total                 18 40 138.90 30.62 140.00  140.59 37.06 53.00 186.00
## vabsdscoresc_dss          19 55  70.89 15.67  72.00   72.38  8.90 21.00 103.00
## vabsdscoresd_dss          20 55  70.67 16.39  71.00   70.84 11.86 17.00 112.00
## vabsdscoress_dss          21 55  66.82 16.15  68.00   68.44 14.83 20.00  95.00
## vabsabcabc_standard       22 55  67.24 15.08  70.00   68.62 10.38  6.00  96.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      21.62  0.37    -0.47 0.66
## meanFD                    1.05  2.21     4.38 0.03
## viq_all                  77.45  0.18    -0.55 2.52
## piq_all                  97.57 -0.38    -0.51 2.92
## fsiq4_all                84.00 -0.20    -0.87 2.52
## A_pct_severity            0.63  0.26    -0.23 0.02
## B_pct_severity            0.40  0.33    -0.80 0.01
## ADI_social_total         25.00 -0.36    -0.86 0.84
## ADI_communication_total  22.00 -0.34    -0.36 0.63
## ADI_RRB_total             8.00  0.61    -0.15 0.25
## ados_2_SA_CSS             9.00 -0.41    -1.01 0.34
## ados_2_RRB_CSS            9.00 -0.14    -1.18 0.36
## SRS_tscore               51.00 -0.34    -0.29 1.58
## SRS_tscore_self          52.00  0.89     0.35 2.45
## RBS_total                90.00  2.09     6.08 2.32
## SSP_total               133.00 -0.50    -0.17 4.84
## vabsdscoresc_dss         82.00 -1.23     2.74 2.11
## vabsdscoresd_dss         95.00 -0.34     1.48 2.21
## vabsdscoress_dss         75.00 -0.98     0.93 2.18
## vabsabcabc_standard      90.00 -1.69     4.81 2.03
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Discovery
##                         vars   n   mean    sd median trimmed   mad    min
## dataset*                   1 121    NaN    NA     NA     NaN    NA    Inf
## subgrp*                    2 121    NaN    NA     NA     NaN    NA    Inf
## age                        3 121  16.83  5.23  16.65   16.73  5.69   7.22
## meanFD                     4 121   0.18  0.15   0.13    0.15  0.07   0.03
## viq_all                    5 119 104.52 19.70 105.00  105.03 19.27  46.00
## piq_all                    6 119 106.10 19.47 107.00  107.53 17.79  49.00
## fsiq4_all                  7 119 105.72 18.33 108.18  106.99 16.58  53.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA    Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA    Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA    Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA    Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA    Inf
## SRS_tscore                15  68  47.84  9.40  45.00   46.32  5.19  37.00
## SRS_tscore_self           16  71  46.69  4.85  46.00   46.26  4.45  39.00
## RBS_total                 17  68   2.15  4.74   0.00    0.95  0.00   0.00
## SSP_total                 18  59 177.86 12.71 182.00  179.78  8.90 122.00
## vabsdscoresc_dss          19  34  91.97 25.44  99.50   93.50 21.50  21.00
## vabsdscoresd_dss          20  34  90.74 20.28  98.50   92.25 17.05  33.00
## vabsdscoress_dss          21  34  96.21 23.67 102.50   98.57 21.50  33.00
## vabsabcabc_standard       22  34  92.06 23.04  99.50   93.89 15.57  25.00
##                            max  range  skew kurtosis   se
## dataset*                  -Inf   -Inf    NA       NA   NA
## subgrp*                   -Inf   -Inf    NA       NA   NA
## age                      29.84  22.62  0.17    -0.51 0.48
## meanFD                    0.85   0.82  2.28     5.65 0.01
## viq_all                 160.00 114.00 -0.24     0.38 1.81
## piq_all                 147.00  98.00 -0.69     0.32 1.79
## fsiq4_all               142.00  89.00 -0.69     0.58 1.68
## A_pct_severity            -Inf   -Inf    NA       NA   NA
## B_pct_severity            -Inf   -Inf    NA       NA   NA
## ADI_social_total          -Inf   -Inf    NA       NA   NA
## ADI_communication_total   -Inf   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf   -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf   -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf   -Inf    NA       NA   NA
## SRS_tscore               76.00  39.00  1.54     1.44 1.14
## SRS_tscore_self          63.00  24.00  0.93     1.10 0.58
## RBS_total                27.00  27.00  3.13    10.87 0.57
## SSP_total               190.00  68.00 -1.90     4.85 1.66
## vabsdscoresc_dss        138.00 117.00 -0.71     0.36 4.36
## vabsdscoresd_dss        121.00  88.00 -0.80     0.07 3.48
## vabsdscoress_dss        129.00  96.00 -0.83    -0.31 4.06
## vabsabcabc_standard     127.00 102.00 -0.90     0.30 3.95
## ------------------------------------------------------------ 
## subgrp: RRB_over_SC
## dataset: Replication
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 3    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 3    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 3  14.54  4.39  12.61   14.54  1.72  11.45  19.56
## meanFD                     4 3   0.23  0.14   0.20    0.23  0.15   0.10   0.38
## viq_all                    5 3 111.00 28.00  99.00  111.00 11.86  91.00 143.00
## piq_all                    6 3 119.33 29.54 121.00  119.33 40.03  89.00 148.00
## fsiq4_all                  7 3 115.67 28.75 106.00  115.67 19.27  93.00 148.00
## A_pct_severity             8 3   0.20  0.07   0.16    0.20  0.01   0.15   0.27
## B_pct_severity             9 3   0.40  0.05   0.40    0.40  0.08   0.35   0.45
## ADI_social_total          10 3  15.67  2.52  16.00   15.67  2.97  13.00  18.00
## ADI_communication_total   11 3   8.67  2.52   9.00    8.67  2.97   6.00  11.00
## ADI_RRB_total             12 3   5.67  1.53   6.00    5.67  1.48   4.00   7.00
## ados_2_SA_CSS             13 3   4.67  2.52   5.00    4.67  2.97   2.00   7.00
## ados_2_RRB_CSS            14 3   4.33  3.06   5.00    4.33  2.97   1.00   7.00
## SRS_tscore                15 3  68.00  6.93  72.00   68.00  0.00  60.00  72.00
## SRS_tscore_self           16 1  67.00    NA  67.00   67.00  0.00  67.00  67.00
## RBS_total                 17 3  12.67  5.51  13.00   12.67  7.41   7.00  18.00
## SSP_total                 18 2 146.00  9.90 146.00  146.00 10.38 139.00 153.00
## vabsdscoresc_dss          19 3  82.33 14.43  74.00   82.33  0.00  74.00  99.00
## vabsdscoresd_dss          20 3  69.33  4.16  68.00   69.33  2.97  66.00  74.00
## vabsdscoress_dss          21 3  84.33  9.71  82.00   84.33  8.90  76.00  95.00
## vabsabcabc_standard       22 3  65.67 23.18  77.00   65.67  5.93  39.00  81.00
##                         range  skew kurtosis    se
## dataset*                 -Inf    NA       NA    NA
## subgrp*                  -Inf    NA       NA    NA
## age                      8.11  0.35    -2.33  2.53
## meanFD                   0.28  0.18    -2.33  0.08
## viq_all                 52.00  0.35    -2.33 16.17
## piq_all                 59.00 -0.06    -2.33 17.05
## fsiq4_all               55.00  0.30    -2.33 16.60
## A_pct_severity           0.12  0.38    -2.33  0.04
## B_pct_severity           0.11 -0.02    -2.33  0.03
## ADI_social_total         5.00 -0.13    -2.33  1.45
## ADI_communication_total  5.00 -0.13    -2.33  1.45
## ADI_RRB_total            3.00 -0.21    -2.33  0.88
## ados_2_SA_CSS            5.00 -0.13    -2.33  1.45
## ados_2_RRB_CSS           6.00 -0.21    -2.33  1.76
## SRS_tscore              12.00 -0.38    -2.33  4.00
## SRS_tscore_self          0.00    NA       NA    NA
## RBS_total               11.00 -0.06    -2.33  3.18
## SSP_total               14.00  0.00    -2.75  7.00
## vabsdscoresc_dss        25.00  0.38    -2.33  8.33
## vabsdscoresd_dss         8.00  0.29    -2.33  2.40
## vabsdscoress_dss        19.00  0.23    -2.33  5.61
## vabsabcabc_standard     42.00 -0.37    -2.33 13.38
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 67    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 67    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 67  16.58  5.91  16.07   16.25  6.51  7.12  30.15
## meanFD                     4 67   0.26  0.30   0.19    0.20  0.12  0.05   1.60
## viq_all                    5 66 102.33 15.72 103.36  102.83 17.61 70.00 133.00
## piq_all                    6 66 103.66 18.68 105.98  104.81 18.57 52.00 134.00
## fsiq4_all                  7 66 103.29 16.70 105.50  104.06 17.42 64.00 131.00
## A_pct_severity             8 67   0.26  0.12   0.24    0.26  0.12  0.04   0.65
## B_pct_severity             9 67   0.24  0.13   0.23    0.23  0.12  0.00   0.67
## ADI_social_total          10 67  14.70  6.22  15.00   14.91  5.93  1.00  27.00
## ADI_communication_total   11 67  11.58  5.53  11.00   11.51  5.93  0.00  24.00
## ADI_RRB_total             12 67   3.90  2.28   4.00    3.84  1.48  0.00   9.00
## ados_2_SA_CSS             13 65   5.62  2.47   6.00    5.68  2.97  1.00  10.00
## ados_2_RRB_CSS            14 65   4.94  2.54   6.00    4.92  1.48  1.00  10.00
## SRS_tscore                15 60  66.20 11.19  67.00   66.00 11.86 43.00  90.00
## SRS_tscore_self           16 32  61.81  8.16  62.00   61.62  6.67 46.00  84.00
## RBS_total                 17 57  13.54 11.27  11.00   12.28 10.38  0.00  52.00
## SSP_total                 18 37 140.19 26.49 142.00  142.13 32.62 69.00 177.00
## vabsdscoresc_dss          19 61  82.59 14.53  81.00   81.78 13.34 50.00 122.00
## vabsdscoresd_dss          20 60  79.18 16.08  78.50   78.67 12.60 38.00 119.00
## vabsdscoress_dss          21 61  76.48 16.12  78.00   77.33 13.34 28.00 112.00
## vabsabcabc_standard       22 60  78.53 12.93  77.50   78.02  9.64 48.00 117.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.03  0.42    -0.64 0.72
## meanFD                    1.55  3.19    10.73 0.04
## viq_all                  63.00 -0.23    -0.85 1.94
## piq_all                  82.00 -0.56    -0.03 2.30
## fsiq4_all                67.00 -0.40    -0.56 2.06
## A_pct_severity            0.61  0.43     0.29 0.01
## B_pct_severity            0.67  0.72     0.90 0.02
## ADI_social_total         26.00 -0.25    -0.77 0.76
## ADI_communication_total  24.00  0.10    -0.59 0.68
## ADI_RRB_total             9.00  0.26    -0.52 0.28
## ados_2_SA_CSS             9.00 -0.23    -0.85 0.31
## ados_2_RRB_CSS            9.00 -0.40    -0.78 0.31
## SRS_tscore               47.00  0.12    -0.76 1.44
## SRS_tscore_self          38.00  0.42     0.53 1.44
## RBS_total                52.00  1.23     1.60 1.49
## SSP_total               108.00 -0.62    -0.39 4.36
## vabsdscoresc_dss         72.00  0.50     0.03 1.86
## vabsdscoresd_dss         81.00  0.22     0.17 2.08
## vabsdscoress_dss         84.00 -0.62     0.97 2.06
## vabsabcabc_standard      69.00  0.45     0.40 1.67
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 63    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 63    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 63  16.30  5.10  16.07   16.17  5.91  7.48  29.23
## meanFD                     4 63   0.22  0.20   0.16    0.19  0.10  0.04   1.31
## viq_all                    5 60  95.21 19.99  98.00   96.37 21.50 50.91 130.00
## piq_all                    6 62  98.09 21.23 103.50   99.53 19.27 44.03 138.00
## fsiq4_all                  7 61  97.35 19.71 103.00   98.23 19.64 59.00 139.00
## A_pct_severity             8 63   0.48  0.14   0.49    0.48  0.16  0.19   0.75
## B_pct_severity             9 63   0.20  0.12   0.20    0.19  0.15  0.00   0.47
## ADI_social_total          10 63  18.67  5.75  19.00   18.92  5.93  6.00  29.00
## ADI_communication_total   11 63  15.11  4.71  16.00   15.29  4.45  4.00  24.00
## ADI_RRB_total             12 63   3.90  2.52   4.00    3.76  2.97  0.00  10.00
## ados_2_SA_CSS             13 60   6.18  2.89   6.00    6.29  4.45  1.00  10.00
## ados_2_RRB_CSS            14 60   4.53  2.76   5.00    4.42  2.97  1.00   9.00
## SRS_tscore                15 55  75.04 11.69  78.00   75.98 13.34 48.00  90.00
## SRS_tscore_self           16 30  62.40  9.92  61.00   62.21  9.64 40.00  84.00
## RBS_total                 17 55  19.35 15.40  15.00   17.69 11.86  0.00  73.00
## SSP_total                 18 44 138.73 25.65 139.50  138.97 28.17 91.00 184.00
## vabsdscoresc_dss          19 57  72.02 14.52  72.00   72.81 11.86 21.00 102.00
## vabsdscoresd_dss          20 57  69.95 15.18  68.00   69.74 14.83 42.00 118.00
## vabsdscoress_dss          21 57  65.23 14.98  66.00   65.62 14.83 23.00 100.00
## vabsabcabc_standard       22 57  67.11 12.99  68.00   67.70  8.90 28.00  94.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     21.75  0.27    -0.76 0.64
## meanFD                   1.26  3.14    13.22 0.02
## viq_all                 79.09 -0.45    -0.67 2.58
## piq_all                 93.97 -0.54    -0.57 2.70
## fsiq4_all               80.00 -0.37    -0.81 2.52
## A_pct_severity           0.56 -0.07    -0.96 0.02
## B_pct_severity           0.47  0.32    -0.80 0.01
## ADI_social_total        23.00 -0.38    -0.68 0.72
## ADI_communication_total 20.00 -0.37    -0.80 0.59
## ADI_RRB_total           10.00  0.45    -0.67 0.32
## ados_2_SA_CSS            9.00 -0.17    -1.24 0.37
## ados_2_RRB_CSS           8.00 -0.13    -1.29 0.36
## SRS_tscore              42.00 -0.59    -0.67 1.58
## SRS_tscore_self         44.00  0.11    -0.23 1.81
## RBS_total               73.00  1.13     1.07 2.08
## SSP_total               93.00 -0.13    -0.98 3.87
## vabsdscoresc_dss        81.00 -0.91     2.25 1.92
## vabsdscoresd_dss        76.00  0.44     0.20 2.01
## vabsdscoress_dss        77.00 -0.32     0.14 1.98
## vabsabcabc_standard     66.00 -0.60     1.15 1.72
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Replication
##                         vars   n   mean    sd median trimmed   mad   min    max
## dataset*                   1 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 122  16.86  6.07  16.34   16.58  7.59  6.89  29.72
## meanFD                     4 122   0.23  0.46   0.14    0.15  0.07  0.04   4.60
## viq_all                    5 122 104.02 17.58 108.18  105.64 12.13 45.00 140.00
## piq_all                    6 122 104.64 18.41 108.96  106.56 14.08 49.00 139.00
## fsiq4_all                  7 122 104.94 17.14 108.09  107.29 11.86 50.00 134.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## SRS_tscore                15  65  47.23  9.34  44.00   45.66  4.45 37.00  90.00
## SRS_tscore_self           16  61  48.44  6.84  47.00   47.63  5.93 39.00  69.00
## RBS_total                 17  63   3.08 11.54   0.00    0.86  0.00  0.00  89.00
## SSP_total                 18  54 174.93 19.38 182.00  178.41  6.67 75.00 190.00
## vabsdscoresc_dss          19  39  92.74 25.45  96.00   95.82 20.76 21.00 125.00
## vabsdscoresd_dss          20  39  91.10 22.65  97.00   93.91 14.83 27.00 122.00
## vabsdscoress_dss          21  39  98.90 27.04 103.00  102.12 17.79 20.00 132.00
## vabsabcabc_standard       22  38  93.00 25.39 100.00   96.44 15.57 20.00 126.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      22.83  0.33    -0.97 0.55
## meanFD                    4.56  7.54    64.83 0.04
## viq_all                  95.00 -0.99     1.51 1.59
## piq_all                  90.00 -0.93     0.67 1.67
## fsiq4_all                84.00 -1.28     1.67 1.55
## A_pct_severity            -Inf    NA       NA   NA
## B_pct_severity            -Inf    NA       NA   NA
## ADI_social_total          -Inf    NA       NA   NA
## ADI_communication_total   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf    NA       NA   NA
## SRS_tscore               53.00  2.14     5.82 1.16
## SRS_tscore_self          30.00  1.05     0.48 0.88
## RBS_total                89.00  6.60    45.89 1.45
## SSP_total               115.00 -2.88    10.92 2.64
## vabsdscoresc_dss        104.00 -1.21     1.00 4.08
## vabsdscoresd_dss         95.00 -1.34     1.26 3.63
## vabsdscoress_dss        112.00 -1.26     1.10 4.33
## vabsabcabc_standard     106.00 -1.42     1.40 4.12
#------------------------------------------------------------------------------
# Cross-tabulation of subtype (subgrp) by sex, one dataset at a time

# Discovery: keep only the rows whose dataset is "Discovery" (rows where the
# comparison is NA are dropped), then count subjects per subtype x sex cell
data2use <- data2write[which(data2write$dataset == "Discovery"), , drop = FALSE]
table(data2use$subgrp,data2use$sex)
##               
##                Female Male
##   RRB_over_SC       4    4
##   SC_equal_RRB     16   49
##   SC_over_RRB      15   45
##   TD               41   80
# Pearson chi-squared test of association between subtype and sex in the
# current data2use subset; the bare name below prints the stored result
cs_res <- chisq.test(x = data2use$subgrp, y = data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.0102, df = 3, p-value = 0.2604
# Replication: keep only the rows whose dataset is "Replication" (rows where
# the comparison is NA are dropped), then count subjects per subtype x sex cell
data2use <- data2write[which(data2write$dataset == "Replication"), , drop = FALSE]
table(data2use$subgrp,data2use$sex)
##               
##                Female Male
##   RRB_over_SC       1    2
##   SC_equal_RRB     17   50
##   SC_over_RRB      18   45
##   TD               47   75
# Pearson chi-squared test of association between subtype and sex in the
# current data2use subset; the bare name below prints the stored result
cs_res <- chisq.test(x = data2use$subgrp, y = data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.0105, df = 3, p-value = 0.2603
#------------------------------------------------------------------------------
# Outcome columns to analyse, one model set per entry
vars2analyze <- c(
  "age", "meanFD", "viq_all", "piq_all", "fsiq4_all",
  "A_pct_severity", "B_pct_severity",
  "ADI_social_total", "ADI_communication_total", "ADI_RRB_total",
  "ados_2_SA_CSS", "ados_2_RRB_CSS",
  "SRS_tscore_self", "RBS_total", "SSP_total",
  "vabsdscoress_dss", "vabsdscoresd_dss", "vabsdscoresc_dss",
  "vabsabcabc_standard"
)

# Display labels, positionally matched to vars2analyze (same length and order)
vnames <- c(
  "Age", "Mean FD", "VIQ", "PIQ", "FIQ",
  "ADI-R SC", "ADI-R RRB",
  "ADI-R Social", "ADI-R Communication", "ADI-R RRB",
  "ADOS SA CSS", "ADOS RRB CSS",
  "SRS", "RBS", "SSP",
  "Vineland Socialization", "Vineland Daily Living Skills",
  "Vineland Communication", "Vineland ABC"
)

# Result columns: omnibus tests over all groups, then the pairwise
# SC_equal_RRB vs SC_over_RRB statistics per dataset, then the replication BF
cnames <- c(
  "All_Disc.fstat",
  "All_Disc.pval",
  "All_Rep.fstat",
  "All_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.fstat",
  "SCequalRRB_vs_SCoverRRB_Disc.tstat",
  "SCequalRRB_vs_SCoverRRB_Disc.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.es",
  "SCequalRRB_vs_SCoverRRB_Rep.fstat",
  "SCequalRRB_vs_SCoverRRB_Rep.tstat",
  "SCequalRRB_vs_SCoverRRB_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Rep.es",
  "SCequalRRB_vs_SCoverRRB.repBF"
)

# Empty (all-NA) results table: one row per variable, one column per statistic,
# plus a trailing varNames column repeating the row names
output_res <- data.frame(
  matrix(nrow = length(vars2analyze), ncol = length(cnames))
)
names(output_res) <- cnames
row.names(output_res) <- vars2analyze
output_res[["varNames"]] <- vars2analyze

# Per-variable group comparisons.
#
# For every outcome in vars2analyze this loop
#   1. fits `outcome ~ subgrp` with a random intercept per Centre (nlme::lme)
#      on all groups, separately for Discovery and Replication, and stores the
#      F statistic and p-value of the subgrp term;
#   2. refits the same model after removing the TD and RRB_over_SC groups
#      (i.e. SC_equal_RRB vs SC_over_RRB only) and stores F, p, the t-value of
#      the second fixed-effect coefficient, and cohens_d() between the groups;
#   3. computes a replication Bayes factor from the Discovery and Replication
#      t-values with BFSALL();
#   4. prints a jitter + boxplot of the outcome by group, faceted by dataset.
#
# Reads:  df2use, vars2analyze, vnames, fontSize
# Writes: rows of output_res (indexed by variable name); prints one plot per
#         variable.

# Plot inputs do not depend on the variable being analysed, so build them once.
colors2use <- get_ggColorHue(3)
df4plot <- subset(df2use, df2use$subgrp!="RRB_over_SC")

for (ivar in seq_along(vars2analyze)) {

  y_var <- vars2analyze[ivar]
  # print(y_var)

  # Model specification shared by all four fits below:
  # subgrp is the only fixed effect, Centre (site) is a random intercept.
  fx_form <- as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
  rx_form <- as.formula(sprintf("~ 1|%s","Centre"))

  #----------------------------------------------------------------------------
  # Discovery, all groups
  df4mod <- subset(df2use, df2use$dataset=="Discovery")
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # omnibus test of the subgrp term
  res <- anova(mod2use)
  output_res[y_var,"All_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Disc.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Replication, all groups
  df4mod <- subset(df2use, df2use$dataset=="Replication")
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # omnibus test of the subgrp term
  res <- anova(mod2use)
  output_res[y_var,"All_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Rep.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Discovery, SC_equal_RRB vs SC_over_RRB only
  df4mod <- subset(df2use, df2use$dataset=="Discovery" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  # NOTE(review): group sizes are counted over every row of the subset, i.e.
  # before lme() drops rows with a missing outcome via na.omit. For outcomes
  # with missing data these are larger than the n the t-value is based on;
  # confirm this is what BFSALL() should receive.
  n1 <- sum(df4mod$subgrp=="SC_equal_RRB")
  m1 <- sum(df4mod$subgrp=="SC_over_RRB")

  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.pval"] <- res["subgrp","p-value"]
  # row 2 of the fixed-effects table is the first non-intercept coefficient
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication, SC_equal_RRB vs SC_over_RRB only
  df4mod <- subset(df2use, df2use$dataset=="Replication" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  n2 <- sum(df4mod$subgrp=="SC_equal_RRB")
  m2 <- sum(df4mod$subgrp=="SC_over_RRB")

  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.pval"] <- res["subgrp","p-value"]
  # row 2 of the fixed-effects table is the first non-intercept coefficient
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication Bayes Factor
  # tobs is the Discovery (original) t-value and trep the Replication t-value.
  # The replication t-value must come from the *_Rep.tstat column; passing the
  # Discovery value twice would compare the Discovery result with itself.
  # n1/m1 are the Discovery group sizes and n2/m2 the Replication group sizes
  # (presumably matching BFSALL's original/replication arguments - verify
  # against Repfunctionspack6.R).
  res_bf <- BFSALL(tobs = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"],
                   trep = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"],
                   n1 = n1,
                   n2 = n2,
                   m1 = m1,
                   m2 = m2,
                   sample = 2,
                   Type = 'ALL')
  output_res[y_var,"SCequalRRB_vs_SCoverRRB.repBF"] <- res_bf["Replication BF","Replication 1"]

  # make a plot: points + boxplot per group (RRB_over_SC excluded), one panel
  # per dataset, y-axis labelled with the display name for this variable
  p <- ggplot(data = df4plot, aes_string(x = "subgrp", y = y_var, colour = "subgrp")) + facet_grid(. ~ dataset)
  p <- p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
  p <- p + ylab(vnames[ivar]) + xlab("Group") +
    scale_x_discrete(labels=c("SC_equal_RRB"="SC=RRB","SC_over_RRB"="SC>RRB","TD"="TD")) +
    scale_colour_manual(values = c(colors2use[2:3],"grey60")) + guides(colour=FALSE) +
    theme(text = element_text(size=fontSize-5),
          axis.text.x = element_text(size=fontSize-5),
          axis.text.y = element_text(size=fontSize-5))
  print(p)

}

# Copy the SC_equal_RRB vs SC_over_RRB effect sizes for the Vineland ABC
# standard score (vabc) and the Vineland daily-living-skills score (vabc_dls)
# into the "0.8" row of the respective summary objects, one cell per dataset.
vabc["0.8", "Discovery"] <-
  output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc["0.8", "Replication"] <-
  output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Rep.es"]

vabc_dls["0.8", "Discovery"] <-
  output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc_dls["0.8", "Replication"] <-
  output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Rep.es"]

# show the full results table
output_res
##                         All_Disc.fstat All_Disc.pval All_Rep.fstat All_Rep.pval
## age                          0.3468322  7.914756e-01     0.5287789 6.629189e-01
## meanFD                       2.4076706  6.776486e-02     0.1930085 9.011052e-01
## viq_all                      2.5251084  5.821669e-02     2.4621363 6.317097e-02
## piq_all                      1.5758519  1.958225e-01     1.0482434 3.718537e-01
## fsiq4_all                    2.4129618  6.732296e-02     1.5741816 1.962065e-01
## A_pct_severity              26.7618096  2.002742e-10    59.8316827 0.000000e+00
## B_pct_severity              30.5403989  1.481371e-11     3.2122715 4.355799e-02
## ADI_social_total             1.1411638  3.227004e-01     9.6503824 1.254231e-04
## ADI_communication_total      1.5717852  2.116868e-01    12.4266051 1.177914e-05
## ADI_RRB_total               27.4062031  1.274802e-10     0.5610312 5.720280e-01
## ados_2_SA_CSS                3.7940808  2.515948e-02     1.3359348 2.667306e-01
## ados_2_RRB_CSS               1.2774997  2.823758e-01     0.5571873 5.742687e-01
## SRS_tscore_self             38.5195895  0.000000e+00    31.9168435 3.996803e-15
## RBS_total                   18.7453428  1.495031e-10    15.3366627 7.082595e-09
## SSP_total                   30.5672415  5.107026e-15    22.8983017 5.992984e-12
## vabsdscoress_dss            23.4592386  1.657785e-12    25.1597168 2.745582e-13
## vabsdscoresd_dss            11.8354963  5.357631e-07    10.9006901 1.584419e-06
## vabsdscoresc_dss             9.7808359  6.217169e-06     9.8065124 5.902403e-06
## vabsabcabc_standard         18.3979940  3.298979e-10    18.1084676 4.323943e-10
##                         SCequalRRB_vs_SCoverRRB_Disc.fstat
## age                                             0.26192656
## meanFD                                          0.97946508
## viq_all                                         0.30231080
## piq_all                                         0.01015093
## fsiq4_all                                       0.12578085
## A_pct_severity                                 34.73241475
## B_pct_severity                                 39.14477824
## ADI_social_total                                1.47054314
## ADI_communication_total                         2.92929727
## ADI_RRB_total                                  30.84312183
## ados_2_SA_CSS                                   0.35270821
## ados_2_RRB_CSS                                  0.78531699
## SRS_tscore_self                                 2.80844616
## RBS_total                                       0.25978094
## SSP_total                                       0.48083177
## vabsdscoress_dss                                3.28380267
## vabsdscoresd_dss                                1.01228803
## vabsdscoresc_dss                                0.43999328
## vabsabcabc_standard                             2.99819941
##                         SCequalRRB_vs_SCoverRRB_Disc.tstat
## age                                              0.5117876
## meanFD                                          -0.9896793
## viq_all                                          0.5498280
## piq_all                                          0.1007518
## fsiq4_all                                        0.3546560
## A_pct_severity                                   5.8934213
## B_pct_severity                                  -6.2565788
## ADI_social_total                                 1.2126595
## ADI_communication_total                          1.7115190
## ADI_RRB_total                                   -5.5536584
## ados_2_SA_CSS                                   -0.5938924
## ados_2_RRB_CSS                                  -0.8861811
## SRS_tscore_self                                  1.6758419
## RBS_total                                       -0.5096871
## SSP_total                                        0.6934203
## vabsdscoress_dss                                -1.8121266
## vabsdscoresd_dss                                -1.0061253
## vabsdscoresc_dss                                -0.6633199
## vabsabcabc_standard                             -1.7315309
##                         SCequalRRB_vs_SCoverRRB_Disc.pval
## age                                          6.097402e-01
## meanFD                                       3.243223e-01
## viq_all                                      5.834766e-01
## piq_all                                      9.199185e-01
## fsiq4_all                                    7.234699e-01
## A_pct_severity                               3.550766e-08
## B_pct_severity                               6.295433e-09
## ADI_social_total                             2.276420e-01
## ADI_communication_total                      8.956880e-02
## ADI_RRB_total                                1.708319e-07
## ados_2_SA_CSS                                5.537306e-01
## ados_2_RRB_CSS                               3.773378e-01
## SRS_tscore_self                              9.989021e-02
## RBS_total                                    6.113826e-01
## SSP_total                                    4.903081e-01
## vabsdscoress_dss                             7.262213e-02
## vabsdscoresd_dss                             3.165241e-01
## vabsdscoresc_dss                             5.084766e-01
## vabsabcabc_standard                          8.611059e-02
##                         SCequalRRB_vs_SCoverRRB_Disc.es
## age                                         -0.08989673
## meanFD                                       0.17718101
## viq_all                                     -0.11356733
## piq_all                                     -0.01252847
## fsiq4_all                                   -0.06349360
## A_pct_severity                              -0.89512431
## B_pct_severity                               1.09777877
## ADI_social_total                            -0.12891662
## ADI_communication_total                     -0.16745540
## ADI_RRB_total                                1.00708452
## ados_2_SA_CSS                                0.11005817
## ados_2_RRB_CSS                               0.17618905
## SRS_tscore_self                             -0.25296265
## RBS_total                                    0.14661093
## SSP_total                                   -0.13203186
## vabsdscoress_dss                             0.26526047
## vabsdscoresd_dss                             0.12028055
## vabsdscoresc_dss                             0.11808163
## vabsabcabc_standard                          0.25391669
##                         SCequalRRB_vs_SCoverRRB_Rep.fstat
## age                                            0.08151188
## meanFD                                         0.82655445
## viq_all                                        3.72814980
## piq_all                                        1.08570322
## fsiq4_all                                      1.85608227
## A_pct_severity                               113.29691674
## B_pct_severity                                 1.51006239
## ADI_social_total                              18.96724209
## ADI_communication_total                       21.24526531
## ADI_RRB_total                                  0.20305007
## ados_2_SA_CSS                                  1.66094873
## ados_2_RRB_CSS                                 1.08347382
## SRS_tscore_self                                0.06525365
## RBS_total                                      8.02057567
## SSP_total                                      0.17011474
## vabsdscoress_dss                              18.72611076
## vabsdscoresd_dss                              12.09828145
## vabsdscoresc_dss                              15.60770992
## vabsabcabc_standard                           27.59806183
##                         SCequalRRB_vs_SCoverRRB_Rep.tstat
## age                                            -0.2855028
## meanFD                                         -0.9091504
## viq_all                                        -1.9308417
## piq_all                                        -1.0419708
## fsiq4_all                                      -1.3623811
## A_pct_severity                                 10.6441024
## B_pct_severity                                 -1.2288460
## ADI_social_total                                4.3551397
## ADI_communication_total                         4.6092587
## ADI_RRB_total                                   0.4506108
## ados_2_SA_CSS                                   1.2887780
## ados_2_RRB_CSS                                 -1.0409005
## SRS_tscore_self                                 0.2554479
## RBS_total                                       2.8320621
## SSP_total                                      -0.4124497
## vabsdscoress_dss                               -4.3273676
## vabsdscoresd_dss                               -3.4782584
## vabsdscoresc_dss                               -3.9506594
## vabsabcabc_standard                            -5.2533857
##                         SCequalRRB_vs_SCoverRRB_Rep.pval
## age                                         7.757315e-01
## meanFD                                      3.650204e-01
## viq_all                                     5.584194e-02
## piq_all                                     2.994689e-01
## fsiq4_all                                   1.755875e-01
## A_pct_severity                              0.000000e+00
## B_pct_severity                              2.214380e-01
## ADI_social_total                            2.741201e-05
## ADI_communication_total                     9.830284e-06
## ADI_RRB_total                               6.530507e-01
## ados_2_SA_CSS                               1.999543e-01
## ados_2_RRB_CSS                              3.000144e-01
## SRS_tscore_self                             7.992967e-01
## RBS_total                                   5.527113e-03
## SSP_total                                   6.811710e-01
## vabsdscoress_dss                            3.274276e-05
## vabsdscoresd_dss                            7.197026e-04
## vabsdscoresc_dss                            1.360567e-04
## vabsabcabc_standard                         7.194943e-07
##                         SCequalRRB_vs_SCoverRRB_Rep.es
## age                                        0.050104241
## meanFD                                     0.159551075
## viq_all                                    0.397233907
## piq_all                                    0.278814494
## fsiq4_all                                  0.325765875
## A_pct_severity                            -1.660514857
## B_pct_severity                             0.296042356
## ADI_social_total                          -0.661290880
## ADI_communication_total                   -0.684879764
## ADI_RRB_total                             -0.003853938
## ados_2_SA_CSS                             -0.211654450
## ados_2_RRB_CSS                             0.152912314
## SRS_tscore_self                           -0.064904510
## RBS_total                                 -0.431994003
## SSP_total                                  0.056040468
## vabsdscoress_dss                           0.721996648
## vabsdscoresd_dss                           0.590093340
## vabsdscoresc_dss                           0.727794865
## vabsabcabc_standard                        0.882109733
##                         SCequalRRB_vs_SCoverRRB.repBF                varNames
## age                                      7.988837e-01                     age
## meanFD                                   1.141523e+00                  meanFD
## viq_all                                  8.138378e-01                 viq_all
## piq_all                                  7.037967e-01                 piq_all
## fsiq4_all                                7.457033e-01               fsiq4_all
## A_pct_severity                           3.481048e+06          A_pct_severity
## B_pct_severity                           1.931946e+07          B_pct_severity
## ADI_social_total                         1.458599e+00        ADI_social_total
## ADI_communication_total                  2.997668e+00 ADI_communication_total
## ADI_RRB_total                            7.330776e+05           ADI_RRB_total
## ados_2_SA_CSS                            8.360759e-01           ados_2_SA_CSS
## ados_2_RRB_CSS                           1.035729e+00          ados_2_RRB_CSS
## SRS_tscore_self                          2.826856e+00         SRS_tscore_self
## RBS_total                                7.973627e-01               RBS_total
## SSP_total                                8.914452e-01               SSP_total
## vabsdscoress_dss                         3.562055e+00        vabsdscoress_dss
## vabsdscoresd_dss                         1.161038e+00        vabsdscoresd_dss
## vabsdscoresc_dss                         8.714754e-01        vabsdscoresc_dss
## vabsabcabc_standard                      3.095480e+00     vabsabcabc_standard

SC-RRB difference z = 0.9

#------------------------------------------------------------------------------
# Subtype assignment at |z| > 0.9 of the SC-minus-RRB difference score.
# Each split is z-scored against its OWN mean and sd before thresholding.
z_thresh <- 0.9

# vars2use = c("dbaes_atotal","dbaes_btotal")

# Discovery: difference score, its norms, then subtype labels
ds_disc <- Dverbal_Discovery[[vars2use[1]]] - Dverbal_Discovery[[vars2use[2]]]
mean2use <- mean(ds_disc)
sd2use <- sd(ds_disc)
Dverbal_Discovery <- make_subtype(Dverbal_Discovery, z_thresh, mean2use, sd2use)

# Replication: same recipe, using the Replication set's own norms
ds_rep <- Dverbal_Replication[[vars2use[1]]] - Dverbal_Replication[[vars2use[2]]]
mean2use <- mean(ds_rep)
sd2use <- sd(ds_rep)
Dverbal_Replication <- make_subtype(Dverbal_Replication, z_thresh, mean2use, sd2use)

#------------------------------------------------------------------------------
# Table of subtypes X sex
#
# For each split: cross-tabulate subtype (rows) by sex (columns), then run a
# Pearson chi-squared test of independence on the same two columns.
# cs_res holds only the most recent test; the Replication call overwrites the
# Discovery result.

# Discovery
table(Dverbal_Discovery$z_ds_group,Dverbal_Discovery$sex)
##               
##                  F   M
##   RRB_over_SC   29 130
##   SC_equal_RRB 134 442
##   SC_over_RRB   34 120
cs_res = chisq.test(Dverbal_Discovery$z_ds_group,Dverbal_Discovery$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Discovery$z_ds_group and Dverbal_Discovery$sex
## X-squared = 1.8247, df = 2, p-value = 0.4016
# Replication
table(Dverbal_Replication$z_ds_group,Dverbal_Replication$sex)
##               
##                  F   M
##   RRB_over_SC   26 126
##   SC_equal_RRB 137 460
##   SC_over_RRB   33 108
# overwrites the Discovery cs_res
cs_res = chisq.test(Dverbal_Replication$z_ds_group,Dverbal_Replication$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Replication$z_ds_group and Dverbal_Replication$sex
## X-squared = 2.5948, df = 2, p-value = 0.2732
#------------------------------------------------------------------------------
# Descriptive stats

# Discovery: per-subtype summary statistics via psych::describeBy.
# NOTE(review): the `sex*` row in the recorded output is NaN/Inf, which
# suggests `sex` is a character column here - confirm whether it should be
# converted to a factor or dropped from cols2use.
cols2use <- c(
  "z_ds_group", "sex", "age", "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)

df2use <- Dverbal_Discovery[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# age is interview_age divided by 12
df2use[["age"]] <- df2use[["interview_age"]] / 12

res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 159   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 159    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 159   9.95  3.96   9.50    9.70  3.46  2.00  22.58  20.58
## ados_age        4  22  91.14 38.67  85.00   88.61 37.06 37.00 171.00 134.00
## ados_sa_css     5  22   6.18  2.48   6.50    6.22  3.71  2.00  10.00   8.00
## ados_rrb_css    6  22   7.45  2.22   8.00    7.72  1.48  1.00  10.00   9.00
## iq              7  43 103.00 14.43 102.00  103.46 16.31 67.00 139.00  72.00
## dbaes_atotal    8 159   0.21  0.11   0.21    0.21  0.12  0.01   0.45   0.44
## dbaes_btotal    9 159   0.49  0.12   0.48    0.48  0.14  0.25   0.79   0.54
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           0.63     0.23 0.31
## ados_age      0.53    -0.80 8.24
## ados_sa_css  -0.09    -1.26 0.53
## ados_rrb_css -1.13     0.96 0.47
## iq           -0.21     0.11 2.20
## dbaes_atotal  0.14    -0.81 0.01
## dbaes_btotal  0.22    -0.68 0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 576   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 576    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 576   9.07  5.41   8.08    8.32  4.69   0  45.75  45.75  1.97
## ados_age        4  93  82.66 44.56  73.00   77.91 50.41  27 202.00 175.00  0.73
## ados_sa_css     5  93   6.91  2.03   7.00    7.01  1.48   1  10.00   9.00 -0.40
## ados_rrb_css    6  93   7.73  2.23   8.00    8.11  1.48   1  10.00   9.00 -1.64
## iq              7 136 104.38 18.44 107.00  105.80 17.05  42 138.00  96.00 -0.85
## dbaes_atotal    8 576   0.30  0.14   0.30    0.30  0.14   0   0.70   0.70  0.02
## dbaes_btotal    9 576   0.32  0.14   0.32    0.32  0.14   0   0.68   0.68 -0.07
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              6.72 0.23
## ados_age        -0.61 4.62
## ados_sa_css     -0.17 0.21
## ados_rrb_css     2.69 0.23
## iq               1.03 1.58
## dbaes_atotal    -0.31 0.01
## dbaes_btotal    -0.30 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 154   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 154    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 154   7.28  5.53   5.58    6.20  2.84  1.67  37.33  35.67
## ados_age        4  38  66.37 33.78  63.00   60.75 25.20 30.00 172.00 142.00
## ados_sa_css     5  38   7.45  1.54   7.00    7.47  1.48  4.00  10.00   6.00
## ados_rrb_css    6  38   8.16  1.79   8.00    8.38  1.48  1.00  10.00   9.00
## iq              7  20 102.80 22.13 106.50  104.19 20.02 40.00 140.00 100.00
## dbaes_atotal    8 154   0.50  0.13   0.49    0.50  0.13  0.20   0.87   0.67
## dbaes_btotal    9 154   0.22  0.10   0.22    0.21  0.09  0.00   0.47   0.47
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.47     7.39 0.45
## ados_age      1.68     2.63 5.48
## ados_sa_css  -0.10    -0.73 0.25
## ados_rrb_css -1.63     4.33 0.29
## iq           -0.84     0.96 4.95
## dbaes_atotal  0.31    -0.25 0.01
## dbaes_btotal  0.17    -0.16 0.01
# Replication: per-subtype summary statistics via psych::describeBy.
# NOTE(review): the `sex*` row in the recorded output is NaN/Inf, which
# suggests `sex` is a character column here - confirm whether it should be
# converted to a factor or dropped from cols2use.
cols2use <- c(
  "z_ds_group", "sex", "age", "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)

df2use <- Dverbal_Replication[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# age is interview_age divided by 12
df2use[["age"]] <- df2use[["interview_age"]] / 12

res <- describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 152   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 152    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 152   9.92  4.57   9.54    9.49  4.14  3.17  28.58  25.42
## ados_age        4  11 107.00 50.26 110.00  105.78 51.89 36.00 189.00 153.00
## ados_sa_css     5  11   6.73  2.05   7.00    6.89  1.48  3.00   9.00   6.00
## ados_rrb_css    6  11   6.55  1.04   7.00    6.56  0.00  5.00   8.00   3.00
## iq              7  45 104.87 17.11 104.00  104.38 16.31 58.00 152.00  94.00
## dbaes_atotal    8 152   0.21  0.10   0.21    0.21  0.09  0.01   0.61   0.60
## dbaes_btotal    9 152   0.51  0.12   0.49    0.50  0.11  0.24   0.93   0.69
##               skew kurtosis    se
## z_ds_group*    NaN      NaN  0.00
## sex*            NA       NA    NA
## age           1.17     2.32  0.37
## ados_age      0.07    -1.44 15.15
## ados_sa_css  -0.80    -0.73  0.62
## ados_rrb_css -0.60    -1.27  0.31
## iq            0.16     0.97  2.55
## dbaes_atotal  0.82     1.43  0.01
## dbaes_btotal  0.64     0.82  0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 597   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 597    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 597   8.98  5.45   8.00    8.17  4.57   0  40.92  40.92  1.79
## ados_age        4 113  78.96 37.85  71.00   74.65 40.03  35 196.00 161.00  0.88
## ados_sa_css     5 113   6.90  2.08   7.00    6.95  2.97   2  10.00   8.00 -0.03
## ados_rrb_css    6 113   7.42  2.33   8.00    7.75  1.48   1  10.00   9.00 -1.34
## iq              7 124 106.51 17.41 108.00  106.73 16.31  57 146.00  89.00 -0.22
## dbaes_atotal    8 597   0.31  0.14   0.31    0.31  0.14   0   0.78   0.78  0.03
## dbaes_btotal    9 597   0.33  0.14   0.33    0.33  0.14   0   0.81   0.81  0.08
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              4.63 0.22
## ados_age         0.14 3.56
## ados_sa_css     -0.89 0.20
## ados_rrb_css     1.46 0.22
## iq               0.19 1.56
## dbaes_atotal    -0.16 0.01
## dbaes_btotal     0.09 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 141   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 141    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 141   7.43  5.60   5.83    6.30  3.09  2.08  30.92  28.83
## ados_age        4  29  67.86 26.37  60.00   65.16 20.76 30.00 141.00 111.00
## ados_sa_css     5  29   7.07  1.69   7.00    7.08  1.48  3.00  10.00   7.00
## ados_rrb_css    6  29   7.48  2.46   8.00    7.80  2.97  1.00  10.00   9.00
## iq              7  17 110.35 19.14 118.00  112.00 14.83 62.00 134.00  72.00
## dbaes_atotal    8 141   0.51  0.13   0.51    0.51  0.14  0.25   0.96   0.71
## dbaes_btotal    9 141   0.21  0.11   0.21    0.21  0.12  0.00   0.50   0.50
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.15     4.85 0.47
## ados_age      1.08     0.74 4.90
## ados_sa_css  -0.10    -0.62 0.31
## ados_rrb_css -1.13     0.77 0.46
## iq           -0.96     0.11 4.64
## dbaes_atotal  0.30     0.15 0.01
## dbaes_btotal  0.18    -0.52 0.01
#------------------------------------------------------------------------------
# Tests of differences in interview age across the subtypes
#
# One-way ANOVA: interview_age is regressed on the 3-level z_ds_group factor
# with lm(), and anova() prints the omnibus F-test for the subtype effect.
# The model is fit separately in each split. interview_age is used in its raw
# units (the descriptive-stats section divides it by 12 to get `age`, so it is
# presumably recorded in months).

# Discovery
mod2use = lm(formula = interview_age ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2   85832   42916  11.001 1.908e-05 ***
## Residuals  886 3456234    3901                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication (mod2use is overwritten; the Discovery fit is not kept)
mod2use = lm(formula = interview_age ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2   66863   33431  8.1544 0.0003096 ***
## Residuals  887 3636501    4100                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in SC across the subtypes
#
# One-way ANOVA of dbaes_atotal (the SC score) on z_ds_group, fit separately
# in each split. The subtypes are defined by thresholding the difference
# dbaes_atotal - dbaes_btotal, so a strong group effect on this variable is
# expected by construction; this is a descriptive check rather than an
# independent test.

# Discovery
mod2use = lm(formula = dbaes_atotal ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.2145  3.6072  215.13 < 2.2e-16 ***
## Residuals  886 14.8563  0.0168                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication (mod2use is overwritten; the Discovery fit is not kept)
mod2use = lm(formula = dbaes_atotal ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.8767  3.4383  195.02 < 2.2e-16 ***
## Residuals  887 15.6387  0.0176                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in RRB across the subtypes
#
# One-way ANOVA of dbaes_btotal (the RRB score) on z_ds_group, fit separately
# in each split. The subtypes are defined by thresholding the difference
# dbaes_atotal - dbaes_btotal, so a strong group effect on this variable is
# expected by construction; this is a descriptive check rather than an
# independent test.

# Discovery
mod2use = lm(formula = dbaes_btotal ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.0621  3.0310  188.29 < 2.2e-16 ***
## Residuals  886 14.2628  0.0161                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication (mod2use is overwritten; the Discovery fit is not kept)
mod2use = lm(formula = dbaes_btotal ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.7172  3.3586  190.83 < 2.2e-16 ***
## Residuals  887 15.6114  0.0176                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in ADOS SA CSS across the subtypes
#
# One-way ANOVA of ados_sa_css on z_ds_group, fit separately in each split.
# Only a subset of subjects contributes: the residual df is 150 here versus
# 886/887 for the ADI-R based models, presumably because lm() drops rows with
# a missing ados_sa_css under the default NA handling.

# Discovery
mod2use = lm(formula = ados_sa_css ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## z_ds_group   2  22.49 11.2456  2.8304 0.06215 .
## Residuals  150 595.98  3.9732                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication (mod2use is overwritten; the Discovery fit is not kept)
mod2use = lm(formula = ados_sa_css ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   1.09  0.5429  0.1344 0.8744
## Residuals  150 605.97  4.0398
#------------------------------------------------------------------------------
# Tests of differences in ADOS RRB CSS across the subtypes
#
# One-way ANOVA of ados_rrb_css on z_ds_group, fit separately in each split.
# Only a subset of subjects contributes: the residual df is 150 here versus
# 886/887 for the ADI-R based models, presumably because lm() drops rows with
# a missing ados_rrb_css under the default NA handling.

# Discovery
mod2use = lm(formula = ados_rrb_css ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   7.93  3.9661  0.8739 0.4194
## Residuals  150 680.79  4.5386
# Replication (mod2use is overwritten; the Discovery fit is not kept)
mod2use = lm(formula = ados_rrb_css ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   8.08  4.0416   0.768 0.4658
## Residuals  150 789.42  5.2628
#------------------------------------------------------------------------------
# Tests of differences in IQ across the subtypes
#
# One-way ANOVA of iq on z_ds_group, fit separately in each split.
# Only a subset of subjects contributes: the residual df is 196 (Discovery)
# and 183 (Replication), presumably because lm() drops rows with a missing iq
# under the default NA handling.

# Discovery
mod2use = lm(formula = iq ~ z_ds_group, data = Dverbal_Discovery)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2     90   44.82  0.1374 0.8717
## Residuals  196  63939  326.22
# Replication (mod2use is overwritten; the Discovery fit is not kept)
mod2use = lm(formula = iq ~ z_ds_group, data = Dverbal_Replication)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2    372  186.08  0.6077 0.5457
## Residuals  183  56034  306.20
#------------------------------------------------------------------------------
# Make scatterplots with difference score Z subtypes in different colors
maxScores <- c(3, 4)

# Discovery: SC (x) against RRB (y), one point per subject, coloured by subtype.
# Both axes are clamped to [0, 1].
p_disc <- ggplot(
  data = Dverbal_Discovery,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Discovery")

# Legend-free copy for the saved panel. `guides(colour = FALSE)` is deprecated
# in ggplot2; "none" is the supported way to suppress a guide.
p1_top_left <- p_disc + guides(colour = "none")
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p1_top_left)
p_disc

# Subtype sample sizes
table(Dverbal_Discovery$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          159          576          154
# Pearson correlation between SC and RRB across all Discovery subjects
cor_res <- cor.test(Dverbal_Discovery$dbaes_atotal, Dverbal_Discovery$dbaes_btotal)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1554332 0.2806578
## sample estimates:
##       cor 
## 0.2189469
# Replication: SC (x) against RRB (y), one point per subject, coloured by
# subtype. Both axes are clamped to [0, 1].
p_rep <- ggplot(
  data = Dverbal_Replication,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Replication")

# Legend-free copy for the saved panel. `guides(colour = FALSE)` is deprecated
# in ggplot2; "none" is the supported way to suppress a guide.
p2_bottom_left <- p_rep + guides(colour = "none")
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p2_bottom_left)
p_rep

# Subtype sample sizes
table(Dverbal_Replication$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          152          597          141
# Pearson correlation between SC and RRB across all Replication subjects
cor_res <- cor.test(Dverbal_Replication$dbaes_atotal, Dverbal_Replication$dbaes_btotal)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1700824 0.2943934
## sample estimates:
##       cor 
## 0.2331903
#------------------------------------------------------------------------------
# Run supervised model with Discovery as training and Replication as Test
#
# "Actual" test labels (tmp_test) come from z-scoring the Replication
# difference scores against the Replication set's own mean and sd.
# "Predicted" test labels (pred_labels) come from z-scoring the same
# Replication scores against the Discovery (training) mean and sd.
# Accuracy is the proportion of test subjects whose two labels agree.

# run validation
# make subtypes using z-scores computed from the mean and sd of the training set
train_data <- Dverbal_Discovery
test_data <- Dverbal_Replication

mean2use <- mean(train_data[, vars2use[1]] - train_data[, vars2use[2]])
sd2use <- sd(train_data[, vars2use[1]] - train_data[, vars2use[2]])
tmp_train <- make_subtype(data2use = train_data,
                          z_thresh = z_thresh,
                          mean2use = mean2use,
                          sd2use = sd2use)

# Keep the training norms now; mean2use/sd2use are reused for the test set.
train_mean <- mean2use
train_sd <- sd2use

mean2use <- mean(test_data[, vars2use[1]] - test_data[, vars2use[2]])
sd2use <- sd(test_data[, vars2use[1]] - test_data[, vars2use[2]])
tmp_test <- make_subtype(data2use = test_data,
                         z_thresh = z_thresh,
                         mean2use = mean2use,
                         sd2use = sd2use)

#==============================================================================
#==============================================================================
# make labels based on train mean and sd
# (mean2use/sd2use are reset to the training values so they end this section
# holding the training norms)
mean2use <- train_mean
sd2use <- train_sd

pred_labels <- make_subtype(data2use = test_data,
                            z_thresh = z_thresh,
                            mean2use = train_mean,
                            sd2use = train_sd)

# Pin the level set so the confusion matrix is always 3x3 with rows and
# columns in the same order, even if a subtype is empty on one side.
# Rows = actual labels, columns = predicted labels.
subtype_levels <- c("RRB_over_SC", "SC_equal_RRB", "SC_over_RRB")
confmat <- table(factor(tmp_test$z_ds_group, levels = subtype_levels),
                 factor(pred_labels$z_ds_group, levels = subtype_levels))
acc <- sum(diag(confmat)) / nrow(pred_labels)

# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#==============================================================================
#==============================================================================

# plot confusion matrix
# Cell colour = percentage of each actual class (row-normalised); cell text =
# raw counts. The hook shrinks the heatmap into the top-right 90% of the page
# so the axis captions fit in the left and bottom margins.
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap expects one more break than colours (101 breaks for 100 colours)
pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
         breaks = seq(0, 100, length.out = 101))
setHook("grid.newpage", NULL, "replace")
# Columns of confmat are the predicted labels (bottom caption) and rows are
# the actual labels (left caption).
grid::grid.text("Predicted Labels", y=-0.07, gp=gpar(fontsize=fontSize))
grid::grid.text("Actual Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))

# show accuracy
true_accuracy <- acc
true_accuracy
## [1] 0.9820225
#================================================================================
#================================================================================
# #------------------------------------------------------------------------------
# # Permute subtype labels to examine how well supervised model performs
#
# # nperm = 10000
#
# # make subtypes using z-scores computed from the mean and sd of the training set
# train_data = Dverbal_Discovery
# test_data = Dverbal_Replication
# mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# tmp_train = make_subtype(data2use = train_data,
#                          z_thresh = z_thresh,
#                          mean2use = mean2use,
#                          sd2use = sd2use)
#
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# tmp_test = make_subtype(data2use = test_data,
#                         z_thresh = z_thresh,
#                         mean2use = mean2use,
#                         sd2use = sd2use)
#
# acc = vector(length = nperm)
# for (iperm in 1:nperm){
#   # set seed for reproducibility
#   set.seed(iperm)
#
#   sc_perm = sample(train_data[,vars2use[1]])
#   rrb_perm = sample(train_data[,vars2use[2]])
#   perm_mean2use = mean(sc_perm - rrb_perm)
#   perm_sd2use = sd(sc_perm - rrb_perm)
#   # perm_mean2use = mean(train_data[,vars2use[1]] - rrb_perm)
#   # perm_sd2use = sd(train_data[,vars2use[1]] - rrb_perm)
#   pred_labels = make_subtype(data2use = tmp_test,
#                         z_thresh = z_thresh,
#                         mean2use = perm_mean2use,
#                         sd2use = perm_sd2use)
#   confmat = table(tmp_test$z_ds_group,pred_labels$z_ds_group)
#   acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/dim(pred_labels)[1]
#
#   # compute model
#   permuted_labels = sample(tmp_train$z_ds_group)
#   mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
#   pred_labels = predict(mod2use, tmp_test[,vars2use])
#   confmat = table(tmp_test$z_ds_group,pred_labels)
#   acc[iperm] = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#   #============================================================================
# } # for (iperm in 1:nperm)
#
# df2plot = data.frame(Accuracy = acc)
# p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
# p
#
# # compute p-value
# pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
# pval
#================================================================================
#================================================================================

#------------------------------------------------------------------------------
# Plot difference score Z subtypes in Discovery set
maxScores <- c(3, 4)

# Discovery - make plot with all individuals shown as lines
# One row per subject; subgrp is taken from tmp_train, which was built from
# Dverbal_Discovery and so shares its row order.
# NOTE(review): adi_total_vars2use is defined outside this section and is
# assumed to include dbaes_atotal and dbaes_btotal - confirm.
df2use <- Dverbal_Discovery[, c("subjectkey", adi_total_vars2use)]
df2use$subgrp <- factor(tmp_train$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# Long format: two rows per subject (SC and RRB) so each subject is drawn as
# a line joining their SC and RRB values.
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

p <- ggplot(data = df4plot, aes(x = variable,
                                y = value,
                                colour = subgrp,
                                group = subjectkey)) + facet_grid(. ~ subgrp)
# `guides(color = FALSE)` is deprecated in ggplot2; "none" is the supported
# way to suppress the legend.
p <- p + geom_point(shape = 1) + geom_line(alpha = 0.2) + ylim(0, 1) + guides(colour = "none")
p <- p + ylab("Percent Severity") + xlab("ADI-R subscale")
# The legend is already suppressed on p, so the saved panel is the same plot.
p3_middle_top <- p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p3_middle_top)
p

# Plot difference score Z subtypes in Replication set
maxScores <- c(3, 4)

# Replication - make plot with all individuals shown as lines
# One row per subject; subgrp is taken from tmp_test, which was built from
# Dverbal_Replication and so shares its row order.
# NOTE(review): adi_total_vars2use is defined outside this section and is
# assumed to include dbaes_atotal and dbaes_btotal - confirm.
df2use <- Dverbal_Replication[, c("subjectkey", adi_total_vars2use)]
df2use$subgrp <- factor(tmp_test$z_ds_group)
df2use <- data.frame(df2use)
df2use$subjectkey <- factor(df2use$subjectkey)
df2use$SC <- df2use$dbaes_atotal
df2use$RRB <- df2use$dbaes_btotal

# Long format: two rows per subject (SC and RRB) so each subject is drawn as
# a line joining their SC and RRB values.
df4plot <- melt(df2use,
                id.vars = c("subjectkey", "subgrp"),
                measure.vars = c("SC", "RRB"))

p <- ggplot(data = df4plot, aes(x = variable,
                                y = value,
                                colour = subgrp,
                                group = subjectkey)) + facet_grid(. ~ subgrp)
# `guides(color = FALSE)` is deprecated in ggplot2; "none" is the supported
# way to suppress the legend.
p <- p + geom_point(shape = 1) + geom_line(alpha = 0.2) + ylim(0, 1) + guides(colour = "none")
p <- p + ylab("Percent Severity") + xlab("ADI-R subscale")
# The legend is already suppressed on p, so the saved panel is the same plot.
p4_middle_bottom <- p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p4_middle_bottom)
p

#------------------------------------------------------------------------------
# Apply NDAR subtypes to EU-AIMS LEAP data

Dverbal = read.csv(file.path(datapath,"tidy_verbal.csv"))
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))

# keep ASD participants that have every A1-A3 and B1-B4 percent severity score
sc_pct_vars = c("A1_pct_severity","A2_pct_severity","A3_pct_severity")
rrb_pct_vars = c("B1_pct_severity","B2_pct_severity","B3_pct_severity","B4_pct_severity")
mask1 = euaims_data$Diagnosis=="ASD"
mask2 = !complete.cases(euaims_data[,c(sc_pct_vars,rrb_pct_vars)]) # TRUE = at least one score missing
euaims_data = subset(euaims_data, (mask1 & !mask2))

# Dverbal's SC and RRB columns are used exactly as read from file (the former
# self-assignments of these columns did nothing and have been removed).
# For EU-AIMS, SC and RRB are the means of the A and B percent severity scores.
euaims_data[,vars2use[1]] = (euaims_data$A1_pct_severity +
                               euaims_data$A2_pct_severity +
                               euaims_data$A3_pct_severity)/3
euaims_data[,vars2use[2]] = (euaims_data$B1_pct_severity +
                               euaims_data$B2_pct_severity +
                               euaims_data$B3_pct_severity +
                               euaims_data$B4_pct_severity)/4

train_data = Dverbal
test_data = euaims_data

# mean and sd of the NDAR difference score (SC - RRB); the same norms are used
# to z-score both the NDAR and the EU-AIMS difference scores
train_diff = train_data[,vars2use[1]] - train_data[,vars2use[2]]
mean2use = mean(train_diff, na.rm=TRUE)
sd2use = sd(train_diff, na.rm=TRUE)
c(mean2use, sd2use)
## [1] -0.01045243  0.19482749
tmp_train = make_subtype(data2use = train_data,
                         z_thresh = z_thresh,
                         mean2use = mean2use,
                         sd2use = sd2use)

tmp_test = make_subtype(data2use = test_data,
                        z_thresh = z_thresh,
                        mean2use = mean2use,
                        sd2use = sd2use)

#===========================================================================
#===========================================================================
# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#
# tmp_test$svm_pred_labels = pred_labels
#
# # plot confusion matrix
# setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
#          breaks= seq(0,100, length=100))
# setHook("grid.newpage", NULL, "replace")
# grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=fontSize))
# grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))
#===========================================================================
#===========================================================================

# scatterplots of SC against RRB, coloured by subtype:
# p1 = NDAR data, p2 = EU-AIMS data labelled with the NDAR-derived thresholds
p1 = ggplot(data = tmp_train,
            aes(x = dbaes_atotal,
                y = dbaes_btotal,
                colour = factor(z_ds_group))) +
  geom_point() +
  labs(x = "Social-Communication", y = "Restricted Repetitive Behaviors") +
  ylim(0,0.8) +
  xlim(0,0.8) +
  ggtitle("NDAR ALL")
p1

p2 = ggplot(data = tmp_test,
            aes(x = dbaes_atotal,
                y = dbaes_btotal,
                colour = factor(z_ds_group))) +
  geom_point() +
  labs(x = "Social-Communication", y = "Restricted Repetitive Behaviors") +
  ylim(0,0.8) +
  xlim(0,0.8) +
  ggtitle("EU-AIMS with Groups from NDAR ALL")
p2

#===========================================================================
#===========================================================================
# p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS")
# p5_bottom_right = p3 + guides(colour=FALSE)
# ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
# p3
#===========================================================================
#===========================================================================

# write out EU-AIMS LEAP data with subgroups defined by NDAR All
# (the file name carries the z threshold that was used)
euaims_subtypes_fname = sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv",as.character(z_thresh))
write.csv(tmp_test, file = file.path(datapath, euaims_subtypes_fname))

#===========================================================================
#===========================================================================
# # Make final plot
# p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
# ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
# p_final
#===========================================================================
#===========================================================================

#------------------------------------------------------------------------------
# Integrate subtypes with rest of EU-AIMS LEAP data
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))

# TD participants: no severity scores, and "TD" as the subgroup label
# NOTE(review): columns 2:6 are picked by position; the rbind() with asd_df
# below needs them to be subid, age, Centre, Schedule and Diagnosis - confirm
# against tidy_euaims.csv
td_df = subset(euaims_data, euaims_data$Diagnosis=="TD",select=2:6)
td_df$A_pct_severity = NA_real_
td_df$B_pct_severity = NA_real_
td_df$subgrp = "TD"

#===========================================================================
#===========================================================================
# cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","z_ds_group")
#===========================================================================
#===========================================================================
# ASD participants: same columns, renamed (by name rather than by position) to
# match td_df
tmp_asd = tmp_test[,cols2use]
old_names = c("dbaes_atotal","dbaes_btotal","z_ds_group")
new_names = c("A_pct_severity","B_pct_severity","subgrp")
colnames(tmp_asd)[match(old_names, colnames(tmp_asd))] = new_names
asd_df = tmp_asd

# The preprocessing spreadsheet sits outside the project directory. The original
# location stays the default; set the EUAIMS_PREPROC_XLSX environment variable
# to read it from somewhere else.
fname = Sys.getenv("EUAIMS_PREPROC_XLSX",
                   unset = "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx")
pp_data = read_excel(fname)
# keep only rows whose notes column is exactly "ok"
mask = pp_data$notes=="ok"
pp_data = subset(pp_data, mask)

# add sex from the spreadsheet; merge() also drops anyone without a retained
# row in pp_data
asd_df = merge(pp_data[,c("subid","sex")],asd_df, by = "subid")
td_df = merge(pp_data[,c("subid","sex")],td_df, by = "subid")

# stack TD and ASD (an earlier rbind done before the merges was overwritten
# here without being used, so it has been removed)
all_data = rbind(td_df,asd_df)

# attach the subtype table to the preprocessing table; columns present in both
# get .x (pp_data) / .y (all_data) suffixes, so choose which age and sex to keep
data2write = merge(x = pp_data, y = all_data, by = "subid")
data2write$age = data2write$age.x
data2write$sex = data2write$sex.y

# subgroup counts by Schedule, Centre and sex
schedule_by_subgrp = table(data2write$Schedule, data2write$subgrp)
print(schedule_by_subgrp)
##    
##     RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   A           5           43          32 78
##   B           1           52          32 83
##   C           4           35          24 59
##   D           0           18          20 23
centre_by_subgrp = table(data2write$Centre, data2write$subgrp)
print(centre_by_subgrp)
##                
##                 RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   CAMBRIDGE               4           31           7 29
##   KINGS_COLLEGE           5           56          45 78
##   MANNHEIM                0            0           0 34
##   NIJMEGEN                1           49          43 64
##   UTRECHT                 0           12          13 38
sex_by_subgrp = table(data2write$sex, data2write$subgrp)
print(sex_by_subgrp)
##         
##          RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Female           5           39          27  88
##   Male             5          109          81 155
#DSM-5 - find best split that balances participants across sites
# (only one seed is evaluated here; the wider range is left commented out)
a = findBestSplit(asd_df, seed_range = 172342) #,300001:500000))
## [1] 172342
# seeds whose discrepancy equals the smallest non-missing discrepancy
min_discrepancy = min(a$discrepancy, na.rm = TRUE)
best_seeds = a$seed[which(a$discrepancy == min_discrepancy)]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
rngSeed = best_seeds[1]

# Split the ASD participants separately within each Schedule, passing the same
# seed every time. From the list SplitDatasetsBySex() returns, element 2 is
# taken as Discovery and element 1 as Replication.
schedules2split = c("A","B","C","D")
Disc_by_schedule = list()
Rep_by_schedule = list()
for (sched in schedules2split) {
  dset2use = subset(asd_df, asd_df$Schedule==sched)
  tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
  Disc_by_schedule[[sched]] = tmp_d[[2]]
  Rep_by_schedule[[sched]] = tmp_d[[1]]
} # for (sched in schedules2split)

# per-schedule objects under their original names, for any code that uses them
A_Discovery = Disc_by_schedule[["A"]]
A_Replication = Rep_by_schedule[["A"]]
B_Discovery = Disc_by_schedule[["B"]]
B_Replication = Rep_by_schedule[["B"]]
C_Discovery = Disc_by_schedule[["C"]]
C_Replication = Rep_by_schedule[["C"]]
D_Discovery = Disc_by_schedule[["D"]]
D_Replication = Rep_by_schedule[["D"]]

# stack the schedules; unname() so rbind() does not prefix row names with the
# schedule letter
df_Disc = do.call(rbind, unname(Disc_by_schedule))
df_Rep = do.call(rbind, unname(Rep_by_schedule))

# Schedule x Centre counts in the Discovery half
a = table(df_Disc$Schedule, df_Disc$Centre)
print(a)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         6            17       10       7
##   B         9            16       14       3
##   C         6             9       14       2
##   D         0            10       10       0
# Schedule x Centre counts in the Replication half
b = table(df_Rep$Schedule, df_Rep$Centre)
print(b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         7            18       10       5
##   B         8            17       13       5
##   C         6            10       13       3
##   D         0             9        9       0
# cell-wise Discovery minus Replication counts
split_diff = a-b
print(split_diff)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A        -1            -1        0       2
##   B         1            -1        1      -2
##   C         0            -1        1      -1
##   D         0             1        1       0
# total absolute difference over all cells
print(sum(abs(split_diff)))
## [1] 14
# mark every row of data2write whose subid landed in the Discovery or the
# Replication split; rows in neither keep NA
data2write$dataset = NA
mask = data2write$subid %in% df_Disc$subid
data2write$dataset[mask] = "Discovery"
mask = data2write$subid %in% df_Rep$subid
data2write$dataset[mask] = "Replication"

# ASD rows of each half (which() leaves out rows where the test is NA, as subset() does)
is_asd = data2write$Diagnosis=="ASD"
asd_Disc = data2write[which(data2write$dataset=="Discovery" & is_asd), ]
asd_Rep = data2write[which(data2write$dataset=="Replication" & is_asd), ]

# # find which seed gives best TD age-match -------------------------------------
# seeds = 1:1000
# pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
# for (i in 1:length(seeds)) {
#   res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
#   td_Disc_matched = res[[2]]
#   td_Rep_matched = res[[1]]
#   tres = t.test(td_Disc_matched$age, asd_Disc$age)
#   pvals[i,1] = tres$p.value
#   tres = t.test(td_Rep_matched$age, asd_Rep$age)
#   pvals[i,2] = tres$p.value
#   #print(i)
# }
# a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
# b=pvals[a$ix,]

# TD age match with a fixed seed (seed_range starts and ends at seed2use);
# element 2 of the result is used for Discovery, element 1 for Replication
seed2use = 929
res = findTDAgeMatch(data2write, seed_range = rep(seed2use, 2))
td_Disc_matched = res[[2]]
td_Rep_matched = res[[1]]

# label the matched TD rows with the half they were matched to
mask = data2write$subid %in% td_Disc_matched$subid
data2write[mask,"dataset"] = "Discovery"

mask = data2write$subid %in% td_Rep_matched$subid
data2write[mask,"dataset"] = "Replication"

dataset_by_subgrp = table(data2write$dataset, data2write$subgrp)
print(dataset_by_subgrp)
##              
##               RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Discovery             8           70          55 121
##   Replication           2           78          53 122
# save the labelled table at the project root, with the z threshold in the name
fname2save = here(sprintf("asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv",as.character(z_thresh)))
write.csv(data2write, file = fname2save)

#------------------------------------------------------------------------------
# Descriptive stats

# measures summarised for every subgroup
measure_vars = c("viq_all","piq_all","fsiq4_all",
                 "A_pct_severity","B_pct_severity",
                 "ADI_social_total","ADI_communication_total","ADI_RRB_total",
                 "ados_2_SA_CSS","ados_2_RRB_CSS",
                 "SRS_tscore","SRS_tscore_self","RBS_total","SSP_total",
                 "vabsdscoresc_dss","vabsdscoresd_dss","vabsdscoress_dss","vabsabcabc_standard")

df2use = data2write[,c("subid","Diagnosis","dataset","Centre","meanFD","age","sex","subgrp",measure_vars)]
# any cell equal to 777 or 999 is set to NA
# NOTE(review): presumably these are missing-data codes; this is applied to every column - confirm
mask = (df2use==777) | (df2use==999)
df2use[mask] = NA
# NOTE(review): age is presumably stored in days, hence the division by 365 - confirm
df2use$age = df2use$age/365

# describe each column within every subgrp x dataset combination
cols2use = c("dataset","subgrp","age","meanFD",measure_vars)
res = describeBy(x = df2use[,cols2use], group = c("subgrp","dataset"))
res
## 
##  Descriptive statistics by group 
## subgrp: RRB_over_SC
## dataset: Discovery
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 8    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 8    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 8  17.53  7.05  21.73   17.53  2.56   7.89  23.88
## meanFD                     4 8   0.27  0.36   0.18    0.27  0.07   0.06   1.14
## viq_all                    5 8 103.86 13.94 107.50  103.86 12.60  78.00 123.85
## piq_all                    6 8 101.37  9.62  99.50  101.37  9.64  87.00 114.00
## fsiq4_all                  7 8 102.75 12.37 103.00  102.75 10.38  80.00 118.01
## A_pct_severity             8 8   0.20  0.11   0.19    0.20  0.13   0.05   0.35
## B_pct_severity             9 8   0.47  0.08   0.47    0.47  0.09   0.36   0.59
## ADI_social_total          10 8  17.50  7.80  20.50   17.50  7.41   5.00  26.00
## ADI_communication_total   11 8  16.25  6.30  17.00   16.25  7.41   6.00  24.00
## ADI_RRB_total             12 8   8.50  1.41   8.50    8.50  2.22   7.00  10.00
## ados_2_SA_CSS             13 8   3.75  2.87   3.00    3.75  2.22   1.00   9.00
## ados_2_RRB_CSS            14 8   3.88  3.98   1.00    3.88  0.00   1.00   9.00
## SRS_tscore                15 4  72.25 13.72  70.50   72.25 12.60  58.00  90.00
## SRS_tscore_self           16 4  59.00  7.12  61.00   59.00  4.45  49.00  65.00
## RBS_total                 17 4  18.00  8.83  17.50   18.00  8.90   8.00  29.00
## SSP_total                 18 3 142.00 24.06 140.00  142.00 31.13 119.00 167.00
## vabsdscoresc_dss          19 5  84.20 18.63  77.00   84.20 14.83  67.00 115.00
## vabsdscoresd_dss          20 5  69.00  9.46  74.00   69.00  7.41  57.00  79.00
## vabsdscoress_dss          21 5  70.40  8.99  71.00   70.40  2.97  57.00  82.00
## vabsabcabc_standard       22 5  72.80 10.76  72.00   72.80  7.41  59.00  88.00
##                         range  skew kurtosis    se
## dataset*                 -Inf    NA       NA    NA
## subgrp*                  -Inf    NA       NA    NA
## age                     16.00 -0.41    -1.98  2.49
## meanFD                   1.09  1.77     1.50  0.13
## viq_all                 45.85 -0.43    -0.93  4.93
## piq_all                 27.00  0.09    -1.56  3.40
## fsiq4_all               38.01 -0.40    -1.08  4.37
## A_pct_severity           0.30  0.01    -1.94  0.04
## B_pct_severity           0.23  0.05    -1.67  0.03
## ADI_social_total        21.00 -0.41    -1.67  2.76
## ADI_communication_total 18.00 -0.26    -1.52  2.23
## ADI_RRB_total            3.00  0.00    -2.05  0.50
## ados_2_SA_CSS            8.00  0.70    -1.16  1.01
## ados_2_RRB_CSS           8.00  0.44    -2.00  1.41
## SRS_tscore              32.00  0.24    -2.00  6.86
## SRS_tscore_self         16.00 -0.50    -1.88  3.56
## RBS_total               21.00  0.11    -1.98  4.42
## SSP_total               48.00  0.08    -2.33 13.89
## vabsdscoresc_dss        48.00  0.71    -1.35  8.33
## vabsdscoresd_dss        22.00 -0.23    -2.10  4.23
## vabsdscoress_dss        25.00 -0.23    -1.46  4.02
## vabsabcabc_standard     29.00  0.14    -1.65  4.81
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 70    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 70    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 70  16.17  5.67  14.92   15.91  4.95  7.08  30.28
## meanFD                     4 70   0.29  0.47   0.19    0.22  0.13  0.03   3.95
## viq_all                    5 69  96.91 18.86  97.35   96.93 21.72 61.00 136.00
## piq_all                    6 69  99.63 22.05 102.00   99.62 23.62 58.00 150.00
## fsiq4_all                  7 70  98.42 19.14 102.25   98.79 20.07 60.00 143.00
## A_pct_severity             8 70   0.31  0.14   0.31    0.31  0.15  0.00   0.63
## B_pct_severity             9 70   0.30  0.15   0.29    0.29  0.16  0.01   0.69
## ADI_social_total          10 70  16.94  6.97  18.00   17.30  8.90  2.00  27.00
## ADI_communication_total   11 70  13.73  5.95  14.00   13.79  6.67  0.00  26.00
## ADI_RRB_total             12 70   5.20  2.55   5.00    5.21  2.97  0.00  12.00
## ados_2_SA_CSS             13 69   6.30  2.62   7.00    6.40  2.97  1.00  10.00
## ados_2_RRB_CSS            14 69   5.01  2.78   5.00    5.00  2.97  1.00  10.00
## SRS_tscore                15 61  72.70 12.49  74.00   73.12 14.83 45.00  95.00
## SRS_tscore_self           16 33  63.09 13.23  64.00   62.04 13.34 43.00  94.00
## RBS_total                 17 58  19.72 16.76  17.00   17.38 14.08  0.00  90.00
## SSP_total                 18 39 132.74 31.42 136.00  133.39 34.10 53.00 189.00
## vabsdscoresc_dss          19 68  73.04 17.83  75.00   73.66 11.86 21.00 122.00
## vabsdscoresd_dss          20 67  73.16 16.69  73.00   73.00 11.86 25.00 131.00
## vabsdscoress_dss          21 68  70.47 15.99  73.00   71.54 13.34 20.00 104.00
## vabsabcabc_standard       22 67  70.84 13.34  72.00   71.13 10.38 20.00 103.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.20  0.46    -0.58 0.68
## meanFD                    3.92  6.60    47.90 0.06
## viq_all                  75.00 -0.11    -0.84 2.27
## piq_all                  92.00 -0.09    -0.62 2.65
## fsiq4_all                83.00 -0.16    -0.77 2.29
## A_pct_severity            0.63  0.00    -0.65 0.02
## B_pct_severity            0.68  0.42    -0.31 0.02
## ADI_social_total         25.00 -0.36    -1.02 0.83
## ADI_communication_total  26.00 -0.07    -0.79 0.71
## ADI_RRB_total            12.00  0.05    -0.42 0.30
## ados_2_SA_CSS             9.00 -0.39    -1.06 0.32
## ados_2_RRB_CSS            9.00 -0.31    -1.09 0.34
## SRS_tscore               50.00 -0.23    -0.93 1.60
## SRS_tscore_self          51.00  0.47    -0.53 2.30
## RBS_total                90.00  1.70     3.93 2.20
## SSP_total               136.00 -0.26    -0.47 5.03
## vabsdscoresc_dss        101.00 -0.39     1.13 2.16
## vabsdscoresd_dss        106.00  0.20     1.95 2.04
## vabsdscoress_dss         84.00 -0.75     0.61 1.94
## vabsabcabc_standard      83.00 -0.57     2.42 1.63
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 55    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 55    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 55  16.43  5.12  15.91   16.20  5.09  7.78  29.40
## meanFD                     4 55   0.23  0.23   0.15    0.18  0.10  0.04   1.08
## viq_all                    5 54  98.07 18.80 100.00   97.54 19.26 65.55 142.00
## piq_all                    6 54  99.26 20.81 102.50  100.66 19.27 52.43 136.38
## fsiq4_all                  7 55  98.84 18.36 101.36   99.53 18.73 59.00 128.30
## A_pct_severity             8 55   0.44  0.14   0.45    0.44  0.12  0.19   0.82
## B_pct_severity             9 55   0.16  0.09   0.15    0.15  0.11  0.00   0.33
## ADI_social_total          10 55  18.02  6.38  18.00   18.40  7.41  3.00  28.00
## ADI_communication_total   11 55  14.58  4.88  15.00   14.78  4.45  2.00  24.00
## ADI_RRB_total             12 55   2.93  1.94   3.00    2.82  1.48  0.00   8.00
## ados_2_SA_CSS             13 53   6.30  2.50   7.00    6.44  2.97  1.00  10.00
## ados_2_RRB_CSS            14 53   4.66  2.79   5.00    4.56  2.97  1.00  10.00
## SRS_tscore                15 49  72.22 11.09  74.00   72.73 10.38 44.00  90.00
## SRS_tscore_self           16 23  61.87  9.45  61.00   61.21  8.90 42.00  89.00
## RBS_total                 17 48  14.04 12.97  11.50   12.50 12.60  0.00  54.00
## SSP_total                 18 37 141.51 28.32 141.00  142.26 35.58 78.00 186.00
## vabsdscoresc_dss          19 50  70.52 16.34  71.50   72.10 11.12 21.00 103.00
## vabsdscoresd_dss          20 50  69.82 16.53  70.00   69.92 11.86 17.00 112.00
## vabsdscoress_dss          21 50  67.16 15.86  68.00   68.85 14.08 20.00  95.00
## vabsabcabc_standard       22 50  66.92 15.46  70.00   68.47  9.64  6.00  96.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      21.62  0.42    -0.35 0.69
## meanFD                    1.04  2.29     4.80 0.03
## viq_all                  76.45  0.25    -0.44 2.56
## piq_all                  83.96 -0.56    -0.45 2.83
## fsiq4_all                69.30 -0.35    -0.94 2.48
## A_pct_severity            0.63  0.26    -0.18 0.02
## B_pct_severity            0.33  0.21    -1.03 0.01
## ADI_social_total         25.00 -0.44    -0.66 0.86
## ADI_communication_total  22.00 -0.38    -0.29 0.66
## ADI_RRB_total             8.00  0.62    -0.09 0.26
## ados_2_SA_CSS             9.00 -0.44    -0.86 0.34
## ados_2_RRB_CSS            9.00 -0.14    -1.16 0.38
## SRS_tscore               46.00 -0.43    -0.19 1.58
## SRS_tscore_self          47.00  0.68     1.21 1.97
## RBS_total                54.00  1.20     1.27 1.87
## SSP_total               108.00 -0.19    -1.09 4.66
## vabsdscoresc_dss         82.00 -1.13     2.23 2.31
## vabsdscoresd_dss         95.00 -0.33     1.48 2.34
## vabsdscoress_dss         75.00 -1.06     1.42 2.24
## vabsabcabc_standard      90.00 -1.70     4.59 2.19
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Discovery
##                         vars   n   mean    sd median trimmed   mad    min
## dataset*                   1 121    NaN    NA     NA     NaN    NA    Inf
## subgrp*                    2 121    NaN    NA     NA     NaN    NA    Inf
## age                        3 121  16.83  5.23  16.65   16.73  5.69   7.22
## meanFD                     4 121   0.18  0.15   0.13    0.15  0.07   0.03
## viq_all                    5 119 104.52 19.70 105.00  105.03 19.27  46.00
## piq_all                    6 119 106.10 19.47 107.00  107.53 17.79  49.00
## fsiq4_all                  7 119 105.72 18.33 108.18  106.99 16.58  53.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA    Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA    Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA    Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA    Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA    Inf
## SRS_tscore                15  68  47.84  9.40  45.00   46.32  5.19  37.00
## SRS_tscore_self           16  71  46.69  4.85  46.00   46.26  4.45  39.00
## RBS_total                 17  68   2.15  4.74   0.00    0.95  0.00   0.00
## SSP_total                 18  59 177.86 12.71 182.00  179.78  8.90 122.00
## vabsdscoresc_dss          19  34  91.97 25.44  99.50   93.50 21.50  21.00
## vabsdscoresd_dss          20  34  90.74 20.28  98.50   92.25 17.05  33.00
## vabsdscoress_dss          21  34  96.21 23.67 102.50   98.57 21.50  33.00
## vabsabcabc_standard       22  34  92.06 23.04  99.50   93.89 15.57  25.00
##                            max  range  skew kurtosis   se
## dataset*                  -Inf   -Inf    NA       NA   NA
## subgrp*                   -Inf   -Inf    NA       NA   NA
## age                      29.84  22.62  0.17    -0.51 0.48
## meanFD                    0.85   0.82  2.28     5.65 0.01
## viq_all                 160.00 114.00 -0.24     0.38 1.81
## piq_all                 147.00  98.00 -0.69     0.32 1.79
## fsiq4_all               142.00  89.00 -0.69     0.58 1.68
## A_pct_severity            -Inf   -Inf    NA       NA   NA
## B_pct_severity            -Inf   -Inf    NA       NA   NA
## ADI_social_total          -Inf   -Inf    NA       NA   NA
## ADI_communication_total   -Inf   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf   -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf   -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf   -Inf    NA       NA   NA
## SRS_tscore               76.00  39.00  1.54     1.44 1.14
## SRS_tscore_self          63.00  24.00  0.93     1.10 0.58
## RBS_total                27.00  27.00  3.13    10.87 0.57
## SSP_total               190.00  68.00 -1.90     4.85 1.66
## vabsdscoresc_dss        138.00 117.00 -0.71     0.36 4.36
## vabsdscoresd_dss        121.00  88.00 -0.80     0.07 3.48
## vabsdscoress_dss        129.00  96.00 -0.83    -0.31 4.06
## vabsabcabc_standard     127.00 102.00 -0.90     0.30 3.95
## ------------------------------------------------------------ 
## subgrp: RRB_over_SC
## dataset: Replication
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 2    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 2    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 2  12.03  0.82  12.03   12.03  0.86  11.45  12.61
## meanFD                     4 2   0.29  0.13   0.29    0.29  0.13   0.20   0.38
## viq_all                    5 2 121.00 31.11 121.00  121.00 32.62  99.00 143.00
## piq_all                    6 2 118.50 41.72 118.50  118.50 43.74  89.00 148.00
## fsiq4_all                  7 2 120.50 38.89 120.50  120.50 40.77  93.00 148.00
## A_pct_severity             8 2   0.16  0.00   0.16    0.16  0.00   0.15   0.16
## B_pct_severity             9 2   0.37  0.04   0.37    0.37  0.04   0.35   0.40
## ADI_social_total          10 2  14.50  2.12  14.50   14.50  2.22  13.00  16.00
## ADI_communication_total   11 2   7.50  2.12   7.50    7.50  2.22   6.00   9.00
## ADI_RRB_total             12 2   6.50  0.71   6.50    6.50  0.74   6.00   7.00
## ados_2_SA_CSS             13 2   6.00  1.41   6.00    6.00  1.48   5.00   7.00
## ados_2_RRB_CSS            14 2   4.00  4.24   4.00    4.00  4.45   1.00   7.00
## SRS_tscore                15 2  66.00  8.49  66.00   66.00  8.90  60.00  72.00
## SRS_tscore_self           16 0    NaN    NA     NA     NaN    NA    Inf   -Inf
## RBS_total                 17 2  15.50  3.54  15.50   15.50  3.71  13.00  18.00
## SSP_total                 18 2 146.00  9.90 146.00  146.00 10.38 139.00 153.00
## vabsdscoresc_dss          19 2  86.50 17.68  86.50   86.50 18.53  74.00  99.00
## vabsdscoresd_dss          20 2  71.00  4.24  71.00   71.00  4.45  68.00  74.00
## vabsdscoress_dss          21 2  85.50 13.44  85.50   85.50 14.08  76.00  95.00
## vabsabcabc_standard       22 2  79.00  2.83  79.00   79.00  2.97  77.00  81.00
##                         range skew kurtosis    se
## dataset*                 -Inf   NA       NA    NA
## subgrp*                  -Inf   NA       NA    NA
## age                      1.16    0    -2.75  0.58
## meanFD                   0.18    0    -2.75  0.09
## viq_all                 44.00    0    -2.75 22.00
## piq_all                 59.00    0    -2.75 29.50
## fsiq4_all               55.00    0    -2.75 27.50
## A_pct_severity           0.01    0    -2.75  0.00
## B_pct_severity           0.06    0    -2.75  0.03
## ADI_social_total         3.00    0    -2.75  1.50
## ADI_communication_total  3.00    0    -2.75  1.50
## ADI_RRB_total            1.00    0    -2.75  0.50
## ados_2_SA_CSS            2.00    0    -2.75  1.00
## ados_2_RRB_CSS           6.00    0    -2.75  3.00
## SRS_tscore              12.00    0    -2.75  6.00
## SRS_tscore_self          -Inf   NA       NA    NA
## RBS_total                5.00    0    -2.75  2.50
## SSP_total               14.00    0    -2.75  7.00
## vabsdscoresc_dss        25.00    0    -2.75 12.50
## vabsdscoresd_dss         6.00    0    -2.75  3.00
## vabsdscoress_dss        19.00    0    -2.75  9.50
## vabsabcabc_standard      4.00    0    -2.75  2.00
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 78    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 78    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 78  16.70  5.64  16.42   16.43  6.04  7.12  30.15
## meanFD                     4 78   0.25  0.28   0.17    0.19  0.10  0.05   1.60
## viq_all                    5 77 102.02 16.24 102.73  102.66 17.39 62.90 133.00
## piq_all                    6 77 104.03 18.25 106.00  105.17 19.27 52.00 134.00
## fsiq4_all                  7 77 103.31 16.26 106.00  104.03 16.98 64.00 131.00
## A_pct_severity             8 78   0.27  0.13   0.26    0.27  0.12  0.04   0.65
## B_pct_severity             9 78   0.24  0.13   0.22    0.23  0.12  0.00   0.67
## ADI_social_total          10 78  14.97  6.05  15.50   15.20  6.67  1.00  27.00
## ADI_communication_total   11 78  11.90  5.47  11.00   11.89  5.93  0.00  24.00
## ADI_RRB_total             12 78   3.90  2.24   4.00    3.83  2.22  0.00   9.00
## ados_2_SA_CSS             13 76   5.58  2.52   6.00    5.63  2.97  1.00  10.00
## ados_2_RRB_CSS            14 76   4.91  2.55   5.00    4.89  1.48  1.00  10.00
## SRS_tscore                15 71  66.54 11.62  67.00   66.33 13.34 43.00  90.00
## SRS_tscore_self           16 39  62.00  7.93  62.00   61.76  5.93 46.00  84.00
## RBS_total                 17 68  12.82 11.02  10.00   11.52  9.64  0.00  52.00
## SSP_total                 18 46 143.24 26.63 143.00  145.08 34.10 69.00 184.00
## vabsdscoresc_dss          19 71  82.11 13.83  79.00   81.18 11.86 50.00 122.00
## vabsdscoresd_dss          20 70  78.60 15.55  77.50   77.98 13.34 38.00 119.00
## vabsdscoress_dss          21 71  76.11 15.38  77.00   76.82 11.86 28.00 112.00
## vabsabcabc_standard       22 70  77.34 13.11  77.00   77.02  8.90 39.00 117.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.03  0.37    -0.54 0.64
## meanFD                    1.55  3.39    12.54 0.03
## viq_all                  70.10 -0.31    -0.77 1.85
## piq_all                  82.00 -0.59    -0.02 2.08
## fsiq4_all                67.00 -0.41    -0.54 1.85
## A_pct_severity            0.61  0.44     0.19 0.01
## B_pct_severity            0.67  0.66     0.55 0.01
## ADI_social_total         26.00 -0.31    -0.68 0.68
## ADI_communication_total  24.00  0.04    -0.68 0.62
## ADI_RRB_total             9.00  0.28    -0.50 0.25
## ados_2_SA_CSS             9.00 -0.19    -0.88 0.29
## ados_2_RRB_CSS            9.00 -0.37    -0.81 0.29
## SRS_tscore               47.00  0.10    -0.85 1.38
## SRS_tscore_self          38.00  0.38     0.46 1.27
## RBS_total                52.00  1.29     1.74 1.34
## SSP_total               115.00 -0.63    -0.36 3.93
## vabsdscoresc_dss         72.00  0.58     0.29 1.64
## vabsdscoresd_dss         81.00  0.26     0.23 1.86
## vabsdscoress_dss         84.00 -0.61     1.16 1.83
## vabsabcabc_standard      78.00  0.21     1.00 1.57
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 53    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 53    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 53  16.13  5.32  15.78   15.94  6.31  7.48  29.23
## meanFD                     4 53   0.23  0.21   0.16    0.20  0.11  0.04   1.31
## viq_all                    5 50  94.03 19.85  97.64   95.13 19.95 50.91 130.00
## piq_all                    6 52  96.80 21.98 101.50   97.99 19.98 44.03 138.00
## fsiq4_all                  7 51  96.21 20.47 102.00   96.81 20.76 59.00 139.00
## A_pct_severity             8 53   0.50  0.13   0.51    0.50  0.14  0.21   0.75
## B_pct_severity             9 53   0.20  0.12   0.21    0.19  0.13  0.00   0.47
## ADI_social_total          10 53  19.08  5.83  20.00   19.37  5.93  6.00  29.00
## ADI_communication_total   11 53  15.30  4.68  16.00   15.51  4.45  4.00  24.00
## ADI_RRB_total             12 53   3.91  2.60   4.00    3.77  2.97  0.00  10.00
## ados_2_SA_CSS             13 50   6.28  2.92   6.00    6.40  4.45  1.00  10.00
## ados_2_RRB_CSS            14 50   4.50  2.77   5.00    4.38  2.97  1.00   9.00
## SRS_tscore                15 45  76.60 10.42  80.00   77.38 11.86 51.00  90.00
## SRS_tscore_self           16 24  62.46 10.53  61.50   62.40 11.12 40.00  84.00
## RBS_total                 17 45  21.58 15.62  18.00   20.05 11.86  1.00  73.00
## SSP_total                 18 35 134.34 24.32 138.00  134.00 26.69 91.00 181.00
## vabsdscoresc_dss          19 48  70.56 15.00  70.50   71.30 10.38 21.00 102.00
## vabsdscoresd_dss          20 48  68.79 15.51  68.00   68.38 14.08 42.00 118.00
## vabsdscoress_dss          21 48  63.77 15.35  64.00   63.92 14.08 23.00 100.00
## vabsabcabc_standard       22 48  65.88 13.57  66.50   66.33  8.90 28.00  94.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     21.75  0.33    -0.81 0.73
## meanFD                   1.26  3.03    11.95 0.03
## viq_all                 79.09 -0.44    -0.62 2.81
## piq_all                 93.97 -0.46    -0.71 3.05
## fsiq4_all               80.00 -0.29    -0.93 2.87
## A_pct_severity           0.54 -0.14    -0.89 0.02
## B_pct_severity           0.47  0.28    -0.81 0.02
## ADI_social_total        23.00 -0.47    -0.63 0.80
## ADI_communication_total 20.00 -0.38    -0.64 0.64
## ADI_RRB_total           10.00  0.45    -0.71 0.36
## ados_2_SA_CSS            9.00 -0.18    -1.32 0.41
## ados_2_RRB_CSS           8.00 -0.12    -1.32 0.39
## SRS_tscore              39.00 -0.57    -0.68 1.55
## SRS_tscore_self         44.00  0.07    -0.40 2.15
## RBS_total               72.00  1.05     0.80 2.33
## SSP_total               90.00 -0.07    -0.90 4.11
## vabsdscoresc_dss        81.00 -0.82     1.92 2.17
## vabsdscoresd_dss        76.00  0.58     0.44 2.24
## vabsdscoress_dss        77.00 -0.16     0.14 2.22
## vabsabcabc_standard     66.00 -0.43     0.86 1.96
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Replication
##                         vars   n   mean    sd median trimmed   mad   min    max
## dataset*                   1 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 122  16.86  6.07  16.34   16.58  7.59  6.89  29.72
## meanFD                     4 122   0.23  0.46   0.14    0.15  0.07  0.04   4.60
## viq_all                    5 122 104.02 17.58 108.18  105.64 12.13 45.00 140.00
## piq_all                    6 122 104.64 18.41 108.96  106.56 14.08 49.00 139.00
## fsiq4_all                  7 122 104.94 17.14 108.09  107.29 11.86 50.00 134.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## SRS_tscore                15  65  47.23  9.34  44.00   45.66  4.45 37.00  90.00
## SRS_tscore_self           16  61  48.44  6.84  47.00   47.63  5.93 39.00  69.00
## RBS_total                 17  63   3.08 11.54   0.00    0.86  0.00  0.00  89.00
## SSP_total                 18  54 174.93 19.38 182.00  178.41  6.67 75.00 190.00
## vabsdscoresc_dss          19  39  92.74 25.45  96.00   95.82 20.76 21.00 125.00
## vabsdscoresd_dss          20  39  91.10 22.65  97.00   93.91 14.83 27.00 122.00
## vabsdscoress_dss          21  39  98.90 27.04 103.00  102.12 17.79 20.00 132.00
## vabsabcabc_standard       22  38  93.00 25.39 100.00   96.44 15.57 20.00 126.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      22.83  0.33    -0.97 0.55
## meanFD                    4.56  7.54    64.83 0.04
## viq_all                  95.00 -0.99     1.51 1.59
## piq_all                  90.00 -0.93     0.67 1.67
## fsiq4_all                84.00 -1.28     1.67 1.55
## A_pct_severity            -Inf    NA       NA   NA
## B_pct_severity            -Inf    NA       NA   NA
## ADI_social_total          -Inf    NA       NA   NA
## ADI_communication_total   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf    NA       NA   NA
## SRS_tscore               53.00  2.14     5.82 1.16
## SRS_tscore_self          30.00  1.05     0.48 0.88
## RBS_total                89.00  6.60    45.89 1.45
## SSP_total               115.00 -2.88    10.92 2.64
## vabsdscoresc_dss        104.00 -1.21     1.00 4.08
## vabsdscoresd_dss         95.00 -1.34     1.26 3.63
## vabsdscoress_dss        112.00 -1.26     1.10 4.33
## vabsabcabc_standard     106.00 -1.42     1.40 4.12
#------------------------------------------------------------------------------
# Cross-tabulate subtype (subgrp) against sex, one dataset at a time

# Discovery: keep the rows whose dataset label is "Discovery"
# (which() drops NA comparisons, matching what subset() does)
data2use <- data2write[which(data2write$dataset == "Discovery"), , drop = FALSE]
table(data2use$subgrp, data2use$sex)
##               
##                Female Male
##   RRB_over_SC       4    4
##   SC_equal_RRB     17   53
##   SC_over_RRB      14   41
##   TD               41   80
# Pearson chi-squared test of subgrp against sex for the current data2use
cs_res <- chisq.test(x = data2use$subgrp, y = data2use$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.028, df = 3, p-value = 0.2585
# Replication: same cross-tabulation on the rows labelled "Replication"
data2use <- data2write[which(data2write$dataset == "Replication"), , drop = FALSE]
table(data2use$subgrp, data2use$sex)
##               
##                Female Male
##   RRB_over_SC       1    1
##   SC_equal_RRB     22   56
##   SC_over_RRB      13   40
##   TD               47   75
# Pearson chi-squared test of subgrp against sex for the current data2use
cs_res <- chisq.test(x = data2use$subgrp, y = data2use$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.4851, df = 3, p-value = 0.2136
#------------------------------------------------------------------------------
# Outcome variables to model, one row of output_res per variable
vars2analyze <- c(
  "age", "meanFD", "viq_all", "piq_all", "fsiq4_all",
  "A_pct_severity", "B_pct_severity",
  "ADI_social_total", "ADI_communication_total", "ADI_RRB_total",
  "ados_2_SA_CSS", "ados_2_RRB_CSS",
  "SRS_tscore_self", "RBS_total", "SSP_total",
  "vabsdscoress_dss", "vabsdscoresd_dss", "vabsdscoresc_dss",
  "vabsabcabc_standard"
)

# Axis labels for the plots, in the same order as vars2analyze
vnames <- c(
  "Age", "Mean FD", "VIQ", "PIQ", "FIQ",
  "ADI-R SC", "ADI-R RRB",
  "ADI-R Social", "ADI-R Communication", "ADI-R RRB",
  "ADOS SA CSS", "ADOS RRB CSS",
  "SRS", "RBS", "SSP",
  "Vineland Socialization", "Vineland Daily Living Skills",
  "Vineland Communication", "Vineland ABC"
)

# Result columns:
#   All_<Disc|Rep>.<fstat|pval>                       omnibus test over all groups
#   SCequalRRB_vs_SCoverRRB_<Disc|Rep>.<fstat|tstat|pval|es>   two-subtype contrast
#   SCequalRRB_vs_SCoverRRB.repBF                     replication Bayes factor
cnames <- c(
  paste0("All_", c("Disc.fstat", "Disc.pval", "Rep.fstat", "Rep.pval")),
  paste0("SCequalRRB_vs_SCoverRRB_",
         rep(c("Disc", "Rep"), each = 4),
         ".",
         c("fstat", "tstat", "pval", "es")),
  "SCequalRRB_vs_SCoverRRB.repBF"
)

# Empty (all-NA) results table, then a trailing column holding the variable names
output_res <- data.frame(matrix(nrow = length(vars2analyze), ncol = length(cnames)))
dimnames(output_res) <- list(vars2analyze, cnames)
output_res[["varNames"]] <- vars2analyze

# For every outcome in vars2analyze:
#   1. omnibus test of subgrp (all groups) in Discovery and in Replication,
#   2. SC_equal_RRB vs SC_over_RRB contrast in each dataset (F, t, p, Cohen's d),
#   3. replication Bayes factor comparing the Discovery and Replication t-stats,
#   4. jitter + boxplot of the outcome by group, faceted by dataset.
# Results are written into the row of output_res named after the outcome.
for (ivar in seq_along(vars2analyze)){

  y_var <- vars2analyze[ivar]
  # print(y_var)

  # Model specification shared by all four fits below:
  # subgrp as the fixed effect, random intercept for Centre (site)
  fx_form <- as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
  rx_form <- as.formula(sprintf("~ 1|%s","Centre"))

  #----------------------------------------------------------------------------
  # Discovery, all groups
  df4mod <- subset(df2use, df2use$dataset=="Discovery")
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"All_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Disc.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Replication, all groups
  df4mod <- subset(df2use, df2use$dataset=="Replication")
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"All_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Rep.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Discovery, SC_equal_RRB vs SC_over_RRB only (TD and RRB_over_SC removed)
  df4mod <- subset(df2use, df2use$dataset=="Discovery" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  # group sizes handed to the replication Bayes factor below
  # NOTE(review): these count every row in the group, including rows that
  # na.omit drops from the model for this outcome - confirm this is intended
  n1 <- sum(df4mod$subgrp=="SC_equal_RRB")
  m1 <- sum(df4mod$subgrp=="SC_over_RRB")

  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.pval"] <- res["subgrp","p-value"]
  # second row of the fixed-effects table is the subgrp coefficient
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                  df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication, SC_equal_RRB vs SC_over_RRB only (TD and RRB_over_SC removed)
  df4mod <- subset(df2use, df2use$dataset=="Replication" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  n2 <- sum(df4mod$subgrp=="SC_equal_RRB")
  m2 <- sum(df4mod$subgrp=="SC_over_RRB")

  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.pval"] <- res["subgrp","p-value"]
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication Bayes Factor
  # tobs is the Discovery t-statistic and trep the Replication t-statistic.
  # (trep was previously given the Discovery t-statistic as well, so the
  # Replication result never entered the Bayes factor.)
  res_bf <- BFSALL(tobs = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"],
                   trep = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"],
                   n1 = n1,
                   n2 = n2,
                   m1 = m1,
                   m2 = m2,
                   sample = 2,
                   Type = 'ALL')
  output_res[y_var,"SCequalRRB_vs_SCoverRRB.repBF"] <- res_bf["Replication BF","Replication 1"]

  # make a plot: every group except RRB_over_SC, one facet per dataset
  colors2use <- get_ggColorHue(3)
  df4plot <- subset(df2use, df2use$subgrp!="RRB_over_SC")
  p <- ggplot(data = df4plot, aes_string(x = "subgrp", y = y_var, colour = "subgrp")) + facet_grid(. ~ dataset)
  p <- p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
  p <- p + ylab(vnames[ivar]) + xlab("Group") +
    scale_x_discrete(labels=c("SC_equal_RRB"="SC=RRB","SC_over_RRB"="SC>RRB","TD"="TD")) +
    scale_colour_manual(values = c(colors2use[2:3],"grey60")) + guides(colour=FALSE) +
    theme(text = element_text(size=fontSize-5),
          axis.text.x = element_text(size=fontSize-5),
          axis.text.y = element_text(size=fontSize-5))
  print(p)

}

# Copy the SC=RRB vs SC>RRB effect sizes (Discovery, Replication) into the
# "0.9" row of the two summary tables: Vineland ABC and Vineland daily living
vabc["0.9", c("Discovery", "Replication")] <- unlist(
  output_res["vabsabcabc_standard",
             c("SCequalRRB_vs_SCoverRRB_Disc.es", "SCequalRRB_vs_SCoverRRB_Rep.es")],
  use.names = FALSE
)
vabc_dls["0.9", c("Discovery", "Replication")] <- unlist(
  output_res["vabsdscoresd_dss",
             c("SCequalRRB_vs_SCoverRRB_Disc.es", "SCequalRRB_vs_SCoverRRB_Rep.es")],
  use.names = FALSE
)
print(output_res)
##                         All_Disc.fstat All_Disc.pval All_Rep.fstat All_Rep.pval
## age                          0.2855127  8.358420e-01    0.90805525 4.377311e-01
## meanFD                       2.3602889  7.205717e-02    0.08650023 9.673869e-01
## viq_all                      2.4328749  6.562775e-02    3.38300738 1.888191e-02
## piq_all                      1.5615896  1.993320e-01    1.20983913 3.067449e-01
## fsiq4_all                    2.3638575  7.174689e-02    2.06813815 1.050238e-01
## A_pct_severity              26.9658312  1.735245e-10   55.13216638 0.000000e+00
## B_pct_severity              29.7710577  2.495759e-11    2.34544394 9.994192e-02
## ADI_social_total             1.4890676  2.294944e-01    8.91634507 2.379619e-04
## ADI_communication_total      1.5868589  2.085961e-01    9.48327261 1.450273e-04
## ADI_RRB_total               26.0311927  3.355514e-10    0.95382609 3.880064e-01
## ados_2_SA_CSS                3.6051473  3.007004e-02    1.35353157 2.621776e-01
## ados_2_RRB_CSS               1.0836446  3.415424e-01    0.86272207 4.245701e-01
## SRS_tscore_self             36.4014535  0.000000e+00   48.01444961 6.661338e-16
## RBS_total                   20.8275522  1.532618e-11   17.79086787 4.331766e-10
## SSP_total                   31.8680712  1.665335e-15   24.34024007 1.511902e-12
## vabsdscoress_dss            23.1160444  2.343903e-12   24.54698034 5.052625e-13
## vabsdscoresd_dss            12.2992810  3.105951e-07   10.60071813 2.272064e-06
## vabsdscoresc_dss             9.8474547  5.733993e-06   10.38141779 2.937494e-06
## vabsabcabc_standard         18.5322400  2.855782e-10   17.01360570 1.429190e-09
##                         SCequalRRB_vs_SCoverRRB_Disc.fstat
## age                                           7.805025e-02
## meanFD                                        8.979743e-01
## viq_all                                       5.476685e-02
## piq_all                                       5.165830e-03
## fsiq4_all                                     1.504420e-02
## A_pct_severity                                3.501725e+01
## B_pct_severity                                3.781501e+01
## ADI_social_total                              2.155285e+00
## ADI_communication_total                       2.939368e+00
## ADI_RRB_total                                 2.847948e+01
## ados_2_SA_CSS                                 5.068942e-04
## ados_2_RRB_CSS                                3.652838e-01
## SRS_tscore_self                               6.311839e-02
## RBS_total                                     3.108287e+00
## SSP_total                                     2.274808e+00
## vabsdscoress_dss                              2.312769e+00
## vabsdscoresd_dss                              2.297887e+00
## vabsdscoresc_dss                              6.674736e-01
## vabsabcabc_standard                           3.406295e+00
##                         SCequalRRB_vs_SCoverRRB_Disc.tstat
## age                                             0.27937474
## meanFD                                         -0.94761507
## viq_all                                         0.23402318
## piq_all                                        -0.07187371
## fsiq4_all                                       0.12265479
## A_pct_severity                                  5.91753787
## B_pct_severity                                 -6.14939138
## ADI_social_total                                1.46808886
## ADI_communication_total                         1.71445846
## ADI_RRB_total                                  -5.33661688
## ados_2_SA_CSS                                   0.02251431
## ados_2_RRB_CSS                                 -0.60438710
## SRS_tscore_self                                 0.25123374
## RBS_total                                      -1.76303352
## SSP_total                                       1.50824657
## vabsdscoress_dss                               -1.52077905
## vabsdscoresd_dss                               -1.51587821
## vabsdscoresc_dss                               -0.81699058
## vabsabcabc_standard                            -1.84561518
##                         SCequalRRB_vs_SCoverRRB_Disc.pval
## age                                          7.804382e-01
## meanFD                                       3.452298e-01
## viq_all                                      8.153725e-01
## piq_all                                      9.428241e-01
## fsiq4_all                                    9.025856e-01
## A_pct_severity                               3.170386e-08
## B_pct_severity                               1.054411e-08
## ADI_social_total                             1.446964e-01
## ADI_communication_total                      8.902613e-02
## ADI_RRB_total                                4.539420e-07
## ados_2_SA_CSS                                9.820760e-01
## ados_2_RRB_CSS                               5.467565e-01
## SRS_tscore_self                              8.026429e-01
## RBS_total                                    8.091942e-02
## SSP_total                                    1.359286e-01
## vabsdscoress_dss                             1.311086e-01
## vabsdscoresd_dss                             1.323675e-01
## vabsdscoresc_dss                             4.156530e-01
## vabsabcabc_standard                          6.758935e-02
##                         SCequalRRB_vs_SCoverRRB_Disc.es
## age                                       -0.0491267306
## meanFD                                     0.1707483830
## viq_all                                   -0.0618981388
## piq_all                                    0.0175129387
## fsiq4_all                                 -0.0221008674
## A_pct_severity                            -0.8963307629
## B_pct_severity                             1.0979606772
## ADI_social_total                          -0.1601300150
## ADI_communication_total                   -0.1549178638
## ADI_RRB_total                              0.9864464663
## ados_2_SA_CSS                              0.0009583416
## ados_2_RRB_CSS                             0.1270034368
## SRS_tscore_self                            0.1041855065
## RBS_total                                  0.3735177224
## SSP_total                                 -0.2913929275
## vabsdscoress_dss                           0.2078098782
## vabsdscoresd_dss                           0.2012499007
## vabsdscoresc_dss                           0.1468494088
## vabsabcabc_standard                        0.2736410750
##                         SCequalRRB_vs_SCoverRRB_Rep.fstat
## age                                            0.33755002
## meanFD                                         0.18055094
## viq_all                                        5.18677068
## piq_all                                        2.59636798
## fsiq4_all                                      3.33876047
## A_pct_severity                               105.88313587
## B_pct_severity                                 2.57049807
## ADI_social_total                              17.31772979
## ADI_communication_total                       16.08830244
## ADI_RRB_total                                  0.01960920
## ados_2_SA_CSS                                  2.67460927
## ados_2_RRB_CSS                                 1.75112047
## SRS_tscore_self                                0.03853939
## RBS_total                                     14.57591064
## SSP_total                                      2.48866761
## vabsdscoress_dss                              19.68832383
## vabsdscoresd_dss                              11.51435205
## vabsdscoresc_dss                              18.65262122
## vabsabcabc_standard                           22.61237695
##                         SCequalRRB_vs_SCoverRRB_Rep.tstat
## age                                            -0.5809906
## meanFD                                         -0.4249129
## viq_all                                        -2.2774483
## piq_all                                        -1.6113249
## fsiq4_all                                      -1.8272275
## A_pct_severity                                 10.2899532
## B_pct_severity                                 -1.6032773
## ADI_social_total                                4.1614577
## ADI_communication_total                         4.0110226
## ADI_RRB_total                                   0.1400329
## ados_2_SA_CSS                                   1.6354233
## ados_2_RRB_CSS                                 -1.3232991
## SRS_tscore_self                                 0.1963145
## RBS_total                                       3.8178411
## SSP_total                                      -1.5775511
## vabsdscoress_dss                               -4.4371527
## vabsdscoresd_dss                               -3.3932804
## vabsdscoresc_dss                               -4.3188680
## vabsabcabc_standard                            -4.7552473
##                         SCequalRRB_vs_SCoverRRB_Rep.pval
## age                                         5.622846e-01
## meanFD                                      6.716249e-01
## viq_all                                     2.450063e-02
## piq_all                                     1.096515e-01
## fsiq4_all                                   7.008981e-02
## A_pct_severity                              0.000000e+00
## B_pct_severity                              1.113773e-01
## ADI_social_total                            5.811897e-05
## ADI_communication_total                     1.029590e-04
## ADI_RRB_total                               8.888576e-01
## ados_2_SA_CSS                               1.045591e-01
## ados_2_RRB_CSS                              1.882310e-01
## SRS_tscore_self                             8.450502e-01
## RBS_total                                   2.249553e-04
## SSP_total                                   1.188223e-01
## vabsdscoress_dss                            2.114563e-05
## vabsdscoresd_dss                            9.529960e-04
## vabsdscoresc_dss                            3.363672e-05
## vabsabcabc_standard                         5.891804e-06
##                         SCequalRRB_vs_SCoverRRB_Rep.es
## age                                        0.103423565
## meanFD                                     0.075639786
## viq_all                                    0.449543002
## piq_all                                    0.364797390
## fsiq4_all                                  0.393005034
## A_pct_severity                            -1.743934905
## B_pct_severity                             0.299146327
## ADI_social_total                          -0.688004039
## ADI_communication_total                   -0.659044077
## ADI_RRB_total                             -0.003443901
## ados_2_SA_CSS                             -0.260690681
## ados_2_RRB_CSS                             0.154522997
## SRS_tscore_self                           -0.050543903
## RBS_total                                 -0.669750517
## SSP_total                                  0.345836573
## vabsdscoress_dss                           0.802957555
## vabsdscoresd_dss                           0.631392506
## vabsdscoresc_dss                           0.806944461
## vabsabcabc_standard                        0.862310814
##                         SCequalRRB_vs_SCoverRRB.repBF                varNames
## age                                      7.313631e-01                     age
## meanFD                                   1.100533e+00                  meanFD
## viq_all                                  7.216283e-01                 viq_all
## piq_all                                  7.047582e-01                 piq_all
## fsiq4_all                                7.081294e-01               fsiq4_all
## A_pct_severity                           3.968454e+06          A_pct_severity
## B_pct_severity                           1.180802e+07          B_pct_severity
## ADI_social_total                         2.055481e+00        ADI_social_total
## ADI_communication_total                  3.023825e+00 ADI_communication_total
## ADI_RRB_total                            2.839841e+05           ADI_RRB_total
## ados_2_SA_CSS                            7.026229e-01           ados_2_SA_CSS
## ados_2_RRB_CSS                           8.431448e-01          ados_2_RRB_CSS
## SRS_tscore_self                          7.252645e-01         SRS_tscore_self
## RBS_total                                3.279224e+00               RBS_total
## SSP_total                                2.182111e+00               SSP_total
## vabsdscoress_dss                         2.216565e+00        vabsdscoress_dss
## vabsdscoresd_dss                         2.203469e+00        vabsdscoresd_dss
## vabsdscoresc_dss                         9.809837e-01        vabsdscoresc_dss
## vabsabcabc_standard                      3.795925e+00     vabsabcabc_standard

SC-RRB difference z = 1

#------------------------------------------------------------------------------
# Threshold on the z-scored difference (vars2use[1] minus vars2use[2]) that
# make_subtype() uses to assign the subtype labels
z_thresh <- 1

# vars2use = c("dbaes_atotal","dbaes_btotal")

# Discovery: z-score the difference with the Discovery set's own mean and sd
ds_disc <- Dverbal_Discovery[[vars2use[1]]] - Dverbal_Discovery[[vars2use[2]]]
mean2use <- mean(ds_disc)
sd2use <- sd(ds_disc)
Dverbal_Discovery <- make_subtype(Dverbal_Discovery, z_thresh, mean2use, sd2use)

# Replication: same procedure, using the Replication set's own mean and sd
ds_rep <- Dverbal_Replication[[vars2use[1]]] - Dverbal_Replication[[vars2use[2]]]
mean2use <- mean(ds_rep)
sd2use <- sd(ds_rep)
Dverbal_Replication <- make_subtype(Dverbal_Replication, z_thresh, mean2use, sd2use)

#------------------------------------------------------------------------------
# Cross-tabulate the z-score subtype (z_ds_group) against sex

# Discovery
table(Dverbal_Discovery[["z_ds_group"]], Dverbal_Discovery[["sex"]])
##               
##                  F   M
##   RRB_over_SC   26 115
##   SC_equal_RRB 143 468
##   SC_over_RRB   28 109
# Pearson chi-squared test of z_ds_group against sex in the Discovery set
cs_res <- chisq.test(x = Dverbal_Discovery$z_ds_group, y = Dverbal_Discovery$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Discovery$z_ds_group and Dverbal_Discovery$sex
## X-squared = 1.9153, df = 2, p-value = 0.3838
# Replication: z_ds_group by sex counts
table(Dverbal_Replication[["z_ds_group"]], Dverbal_Replication[["sex"]])
##               
##                  F   M
##   RRB_over_SC   22 115
##   SC_equal_RRB 147 482
##   SC_over_RRB   27  97
# Pearson chi-squared test of z_ds_group against sex in the Replication set
cs_res <- chisq.test(x = Dverbal_Replication$z_ds_group, y = Dverbal_Replication$sex)
print(cs_res)
## 
##  Pearson's Chi-squared test
## 
## data:  Dverbal_Replication$z_ds_group and Dverbal_Replication$sex
## X-squared = 3.5077, df = 2, p-value = 0.1731
#------------------------------------------------------------------------------
# Per-subtype descriptive statistics

# Discovery: pull the columns of interest and express age in years
# (interview_age divided by 12)
df2use <- Dverbal_Discovery[, c("subjectkey", "dataset_id", "collection_id",
                                "interview_age", "sex", "z_ds_group",
                                "ados_age", "ados_sa_css", "ados_rrb_css",
                                "iq", "dbaes_atotal", "dbaes_btotal")]
df2use[["age"]] <- df2use[["interview_age"]] / 12

# columns summarised by describeBy(), grouped on z_ds_group
cols2use <- c("z_ds_group", "sex", "age",
              "ados_age", "ados_sa_css", "ados_rrb_css",
              "iq", "dbaes_atotal", "dbaes_btotal")
res <- describeBy(x = df2use[cols2use], group = "z_ds_group")
print(res)
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 141   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 141    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 141  10.05  4.02   9.33    9.76  3.34  3.00  22.58  19.58
## ados_age        4  19  86.95 34.46  84.00   84.94 26.69 37.00 171.00 134.00
## ados_sa_css     5  19   6.11  2.49   7.00    6.12  2.97  2.00  10.00   8.00
## ados_rrb_css    6  19   7.37  2.29   8.00    7.59  1.48  1.00  10.00   9.00
## iq              7  39 102.69 14.59 102.00  103.03 16.31 67.00 139.00  72.00
## dbaes_atotal    8 141   0.21  0.10   0.20    0.20  0.11  0.01   0.45   0.44
## dbaes_btotal    9 141   0.49  0.12   0.49    0.49  0.13  0.26   0.79   0.53
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           0.70     0.14 0.34
## ados_age      0.69    -0.02 7.91
## ados_sa_css  -0.19    -1.35 0.57
## ados_rrb_css -1.14     0.78 0.53
## iq           -0.20     0.18 2.34
## dbaes_atotal  0.15    -0.79 0.01
## dbaes_btotal  0.22    -0.72 0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 611   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 611    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 611   9.03  5.38   8.08    8.29  4.69   0  45.75  45.75  1.96
## ados_age        4  99  84.88 45.80  73.00   80.72 50.41  27 202.00 175.00  0.64
## ados_sa_css     5  99   6.92  2.01   7.00    7.01  1.48   1  10.00   9.00 -0.37
## ados_rrb_css    6  99   7.68  2.30   8.00    8.06  1.48   1  10.00   9.00 -1.57
## iq              7 142 103.89 18.97 107.00  105.53 17.79  40 138.00  98.00 -0.95
## dbaes_atotal    8 611   0.31  0.14   0.31    0.31  0.14   0   0.70   0.70  0.01
## dbaes_btotal    9 611   0.32  0.14   0.32    0.32  0.14   0   0.68   0.68 -0.05
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              6.69 0.22
## ados_age        -0.85 4.60
## ados_sa_css     -0.18 0.20
## ados_rrb_css     2.33 0.23
## iq               1.28 1.59
## dbaes_atotal    -0.32 0.01
## dbaes_btotal    -0.33 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 137   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 137    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 137   7.24  5.50   5.50    6.19  2.84  1.67  37.33  35.67
## ados_age        4  35  61.69 26.99  62.00   58.38 23.72 30.00 172.00 142.00
## ados_sa_css     5  35   7.46  1.60   7.00    7.48  1.48  4.00  10.00   6.00
## ados_rrb_css    6  35   8.37  1.31   8.00    8.41  1.48  5.00  10.00   5.00
## iq              7  18 106.83 17.10 111.50  106.50 17.79 79.00 140.00  61.00
## dbaes_atotal    8 137   0.51  0.13   0.50    0.51  0.13  0.20   0.87   0.67
## dbaes_btotal    9 137   0.21  0.10   0.21    0.21  0.09  0.00   0.47   0.47
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.50     7.77 0.47
## ados_age      1.89     5.55 4.56
## ados_sa_css  -0.11    -0.88 0.27
## ados_rrb_css -0.30    -0.76 0.22
## iq            0.05    -1.12 4.03
## dbaes_atotal  0.27    -0.24 0.01
## dbaes_btotal  0.15    -0.12 0.01
# Replication set: descriptive statistics per subtype
df2use = Dverbal_Replication[, c(
  "subjectkey", "dataset_id", "collection_id", "interview_age", "sex",
  "z_ds_group", "ados_age", "ados_sa_css", "ados_rrb_css", "iq",
  "dbaes_atotal", "dbaes_btotal"
)]
# age is interview_age divided by 12 (presumably months -> years; confirm units)
df2use$age = df2use$interview_age / 12

cols2use = c(
  "z_ds_group", "sex", "age",
  "ados_age", "ados_sa_css", "ados_rrb_css",
  "iq", "dbaes_atotal", "dbaes_btotal"
)
res = describeBy(x = df2use[, cols2use], group = "z_ds_group")
res
## 
##  Descriptive statistics by group 
## group: RRB_over_SC
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 137   1.00  0.00   1.00    1.00  0.00  1.00   1.00   0.00
## sex*            2 137    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 137   9.76  4.35   9.42    9.39  3.83  3.17  28.58  25.42
## ados_age        4   8 112.00 52.41 100.00  112.00 58.56 39.00 189.00 150.00
## ados_sa_css     5   8   6.62  2.39   7.00    6.62  2.22  3.00   9.00   6.00
## ados_rrb_css    6   8   6.62  1.06   7.00    6.62  0.00  5.00   8.00   3.00
## iq              7  40 106.92 15.88 105.50  105.66 14.08 70.00 152.00  82.00
## dbaes_atotal    8 137   0.21  0.10   0.21    0.21  0.09  0.01   0.51   0.50
## dbaes_btotal    9 137   0.52  0.12   0.51    0.51  0.11  0.24   0.93   0.69
##               skew kurtosis    se
## z_ds_group*    NaN      NaN  0.00
## sex*            NA       NA    NA
## age           1.19     2.73  0.37
## ados_age      0.13    -1.71 18.53
## ados_sa_css  -0.60    -1.41  0.84
## ados_rrb_css -0.60    -1.26  0.37
## iq            0.57     0.61  2.51
## dbaes_atotal  0.60     0.57  0.01
## dbaes_btotal  0.67     0.97  0.01
## ------------------------------------------------------------ 
## group: SC_equal_RRB
##              vars   n   mean    sd median trimmed   mad min    max  range  skew
## z_ds_group*     1 629   2.00  0.00   2.00    2.00  0.00   2   2.00   0.00   NaN
## sex*            2 629    NaN    NA     NA     NaN    NA Inf   -Inf   -Inf    NA
## age             3 629   9.10  5.54   8.00    8.27  4.57   0  40.92  40.92  1.74
## ados_age        4 119  79.42 37.97  71.00   75.51 41.51  35 196.00 161.00  0.83
## ados_sa_css     5 119   6.93  2.04   7.00    6.98  1.48   2  10.00   8.00 -0.06
## ados_rrb_css    6 119   7.40  2.30   8.00    7.71  1.48   1  10.00   9.00 -1.31
## iq              7 135 105.44 17.96 108.00  105.90 16.31  57 146.00  89.00 -0.30
## dbaes_atotal    8 629   0.31  0.14   0.31    0.31  0.14   0   0.78   0.78  0.04
## dbaes_btotal    9 629   0.33  0.14   0.33    0.33  0.14   0   0.81   0.81  0.12
##              kurtosis   se
## z_ds_group*       NaN 0.00
## sex*               NA   NA
## age              4.28 0.22
## ados_age         0.00 3.48
## ados_sa_css     -0.83 0.19
## ados_rrb_css     1.50 0.21
## iq               0.23 1.55
## dbaes_atotal    -0.20 0.01
## dbaes_btotal     0.18 0.01
## ------------------------------------------------------------ 
## group: SC_over_RRB
##              vars   n   mean    sd median trimmed   mad   min    max  range
## z_ds_group*     1 124   3.00  0.00   3.00    3.00  0.00  3.00   3.00   0.00
## sex*            2 124    NaN    NA     NA     NaN    NA   Inf   -Inf   -Inf
## age             3 124   6.92  5.14   5.25    5.93  2.72  2.08  28.50  26.42
## ados_age        4  26  66.15 24.50  59.50   63.82 22.98 30.00 141.00 111.00
## ados_sa_css     5  26   6.96  1.73   6.50    6.95  2.22  3.00  10.00   7.00
## ados_rrb_css    6  26   7.42  2.55   8.00    7.77  2.97  1.00  10.00   9.00
## iq              7  11 117.27 13.76 119.00  118.78 13.34 87.00 134.00  47.00
## dbaes_atotal    8 124   0.53  0.13   0.52    0.52  0.13  0.25   0.96   0.71
## dbaes_btotal    9 124   0.21  0.11   0.20    0.21  0.11  0.00   0.50   0.50
##               skew kurtosis   se
## z_ds_group*    NaN      NaN 0.00
## sex*            NA       NA   NA
## age           2.35     6.07 0.46
## ados_age      1.06     1.15 4.81
## ados_sa_css   0.01    -0.68 0.34
## ados_rrb_css -1.10     0.50 0.50
## iq           -0.81    -0.42 4.15
## dbaes_atotal  0.31     0.20 0.01
## dbaes_btotal  0.22    -0.43 0.01
#------------------------------------------------------------------------------
# Tests of differences in interview age across the subtypes

# Discovery: linear model of interview age on subtype; anova() gives the F-test
mod2use = lm(
  formula = interview_age ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2   82914   41457  10.618 2.772e-05 ***
## Residuals  886 3459152    3904                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: linear model of interview age on subtype; anova() gives the F-test
mod2use = lm(
  formula = interview_age ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: interview_age
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2   88049   44025  10.801 2.319e-05 ***
## Residuals  887 3615315    4076                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in SC across the subtypes

# Discovery: linear model of the SC score (dbaes_atotal) on subtype
mod2use = lm(
  formula = dbaes_atotal ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  7.1565  3.5782  212.57 < 2.2e-16 ***
## Residuals  886 14.9143  0.0168                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: linear model of the SC score (dbaes_atotal) on subtype
mod2use = lm(
  formula = dbaes_atotal ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_atotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.7942  3.3971  191.67 < 2.2e-16 ***
## Residuals  887 15.7212  0.0177                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in RRB across the subtypes

# Discovery: linear model of the RRB score (dbaes_btotal) on subtype
mod2use = lm(
  formula = dbaes_btotal ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  5.7364 2.86820  174.19 < 2.2e-16 ***
## Residuals  886 14.5884 0.01647                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: linear model of the RRB score (dbaes_btotal) on subtype
mod2use = lm(
  formula = dbaes_btotal ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: dbaes_btotal
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## z_ds_group   2  6.5659  3.2830  184.74 < 2.2e-16 ***
## Residuals  887 15.7627  0.0178                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Tests of differences in ADOS SA CSS across the subtypes

# Discovery: linear model of ADOS SA CSS on subtype
# (rows with a missing ados_sa_css are dropped by lm's default NA handling)
mod2use = lm(
  formula = ados_sa_css ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## z_ds_group   2  22.64 11.3209    2.85 0.06098 .
## Residuals  150 595.83  3.9722                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replication: linear model of ADOS SA CSS on subtype
# (rows with a missing ados_sa_css are dropped by lm's default NA handling)
mod2use = lm(
  formula = ados_sa_css ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_sa_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   0.76  0.3801   0.094 0.9103
## Residuals  150 606.30  4.0420
#------------------------------------------------------------------------------
# Tests of differences in ADOS RRB CSS across the subtypes

# Discovery: linear model of ADOS RRB CSS on subtype
# (rows with a missing ados_rrb_css are dropped by lm's default NA handling)
mod2use = lm(
  formula = ados_rrb_css ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2  16.47  8.2350  1.8375 0.1628
## Residuals  150 672.25  4.4817
# Replication: linear model of ADOS RRB CSS on subtype
# (rows with a missing ados_rrb_css are dropped by lm's default NA handling)
mod2use = lm(
  formula = ados_rrb_css ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: ados_rrb_css
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2   4.64  2.3217  0.4392 0.6453
## Residuals  150 792.86  5.2857
#------------------------------------------------------------------------------
# Tests of differences in IQ across the subtypes

# Discovery: linear model of IQ on subtype
# (rows with a missing iq are dropped by lm's default NA handling)
mod2use = lm(
  formula = iq ~ z_ds_group,
  data = Dverbal_Discovery
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value Pr(>F)
## z_ds_group   2    212  105.85  0.3251 0.7228
## Residuals  196  63817  325.60
# Replication: linear model of IQ on subtype
# (rows with a missing iq are dropped by lm's default NA handling)
mod2use = lm(
  formula = iq ~ z_ds_group,
  data = Dverbal_Replication
)
anova(mod2use)
## Analysis of Variance Table
## 
## Response: iq
##             Df Sum Sq Mean Sq F value  Pr(>F)  
## z_ds_group   2   1434  716.97  2.3868 0.09478 .
## Residuals  183  54972  300.40                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#------------------------------------------------------------------------------
# Make scatterplots with difference score Z subtypes in different colors
maxScores = c(3,4)

# Discovery: SC (dbaes_atotal) against RRB (dbaes_btotal), one point per
# individual, coloured by subtype. Both axes are fixed to [0, 1].
p_disc = ggplot(
  data = Dverbal_Discovery,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Discovery")

# Legend-free copy that is written to disk. guides(colour = "none") replaces
# the deprecated guides(colour = FALSE) form and draws the same plot.
p1_top_left = p_disc + guides(colour = "none")
ggsave(
  filename = file.path(plotpath, sprintf("final_NDAR_Disc_scatterplot_z%s.pdf",as.character(z_thresh))),
  plot = p1_top_left
)
p_disc

# number of individuals per subtype
table(Dverbal_Discovery$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          141          611          137
# Pearson correlation between the SC and RRB scores in the Discovery set.
# The argument expressions are kept verbatim: cor.test builds the printed
# "data:" line from them.
cor_res = cor.test(
  x = Dverbal_Discovery$dbaes_atotal,
  y = Dverbal_Discovery$dbaes_btotal
)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Discovery$dbaes_atotal and Dverbal_Discovery$dbaes_btotal
## t = 6.6829, df = 887, p-value = 4.134e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1554332 0.2806578
## sample estimates:
##       cor 
## 0.2189469
# Replication: SC (dbaes_atotal) against RRB (dbaes_btotal), one point per
# individual, coloured by subtype. Both axes are fixed to [0, 1].
p_rep = ggplot(
  data = Dverbal_Replication,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = z_ds_group)
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR Replication")

# Legend-free copy that is written to disk. guides(colour = "none") replaces
# the deprecated guides(colour = FALSE) form and draws the same plot.
p2_bottom_left = p_rep + guides(colour = "none")
ggsave(
  filename = file.path(plotpath, sprintf("final_NDAR_Rep_scatterplot_z%s.pdf",as.character(z_thresh))),
  plot = p2_bottom_left
)
p_rep

# number of individuals per subtype
table(Dverbal_Replication$z_ds_group)
## 
##  RRB_over_SC SC_equal_RRB  SC_over_RRB 
##          137          629          124
# Pearson correlation between the SC and RRB scores in the Replication set.
# The argument expressions are kept verbatim: cor.test builds the printed
# "data:" line from them.
cor_res = cor.test(
  x = Dverbal_Replication$dbaes_atotal,
  y = Dverbal_Replication$dbaes_btotal
)
cor_res
## 
##  Pearson's product-moment correlation
## 
## data:  Dverbal_Replication$dbaes_atotal and Dverbal_Replication$dbaes_btotal
## t = 7.1459, df = 888, p-value = 1.864e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1700824 0.2943934
## sample estimates:
##       cor 
## 0.2331903
#------------------------------------------------------------------------------
# Run supervised model with Discovery as training and Replication as Test

# run validation
# Reference labels: each set is subtyped against the mean and sd of its OWN
# SC - RRB difference score.
train_data = Dverbal_Discovery
test_data = Dverbal_Replication

# training set norms and labels
mean2use = mean(train_data[, vars2use[1]] - train_data[, vars2use[2]])
sd2use = sd(train_data[, vars2use[1]] - train_data[, vars2use[2]])
tmp_train = make_subtype(
  data2use = train_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

# test set norms and labels (mean2use / sd2use now hold the test-set values)
mean2use = mean(test_data[, vars2use[1]] - test_data[, vars2use[2]])
sd2use = sd(test_data[, vars2use[1]] - test_data[, vars2use[2]])
tmp_test = make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#==============================================================================
#==============================================================================
# Predicted labels: subtype the test set using the TRAINING set's mean and sd
mean2use = mean(train_data[, vars2use[1]] - train_data[, vars2use[2]])
sd2use = sd(train_data[, vars2use[1]] - train_data[, vars2use[2]])
train_mean = mean2use
train_sd = sd2use

pred_labels = make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = train_mean,
  sd2use = train_sd
)

# Confusion matrix: rows are the test set's own labels (tmp_test), columns are
# the labels obtained with the training norms (pred_labels).
confmat = table(tmp_test$z_ds_group, pred_labels$z_ds_group)

# accuracy = diagonal count over the number of test individuals
acc = (confmat[1, 1] + confmat[2, 2] + confmat[3, 3]) / nrow(pred_labels)

# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#==============================================================================
#==============================================================================

# plot confusion matrix
# The temporary grid.newpage hook pushes a viewport covering the top-right 90%
# of the page, leaving a margin on the left and bottom for the axis titles
# drawn with grid.text() below. The hook is removed right after pheatmap().
setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# Cells are coloured by row percentage (each row divided by its row sum) and
# annotated with the raw counts. pheatmap expects one more break than colours,
# so 100 colours need 101 breaks; length.out is spelled out instead of relying
# on partial matching of `length`.
pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
         breaks = seq(0, 100, length.out = 101))
setHook("grid.newpage", NULL, "replace")
# confmat = table(tmp_test$z_ds_group, pred_labels$z_ds_group): rows hold the
# actual labels and columns the predicted labels. The title below the heatmap
# (y = -0.07) therefore names the columns and the rotated title on the left
# (x = -0.07) names the rows.
grid::grid.text("Predicted Labels", y=-0.07, gp=gpar(fontsize=fontSize))
grid::grid.text("Actual Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))

# show accuracy
true_accuracy = acc
true_accuracy
## [1] 0.9910112
#================================================================================
#================================================================================
# #------------------------------------------------------------------------------
# # Permute subtype labels to examine how well supervised model performs
#
# # nperm = 10000
#
# # make subtypes using z-scores computed from the mean and sd of the training set
# train_data = Dverbal_Discovery
# test_data = Dverbal_Replication
# mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]])
# tmp_train = make_subtype(data2use = train_data,
#                          z_thresh = z_thresh,
#                          mean2use = mean2use,
#                          sd2use = sd2use)
#
# mean2use = mean(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# sd2use = sd(test_data[,vars2use[1]] - test_data[,vars2use[2]])
# tmp_test = make_subtype(data2use = test_data,
#                         z_thresh = z_thresh,
#                         mean2use = mean2use,
#                         sd2use = sd2use)
#
# acc = vector(length = nperm)
# for (iperm in 1:nperm){
#   # set seed for reproducibility
#   set.seed(iperm)
#
#   sc_perm = sample(train_data[,vars2use[1]])
#   rrb_perm = sample(train_data[,vars2use[2]])
#   perm_mean2use = mean(sc_perm - rrb_perm)
#   perm_sd2use = sd(sc_perm - rrb_perm)
#   # perm_mean2use = mean(train_data[,vars2use[1]] - rrb_perm)
#   # perm_sd2use = sd(train_data[,vars2use[1]] - rrb_perm)
#   pred_labels = make_subtype(data2use = tmp_test,
#                         z_thresh = z_thresh,
#                         mean2use = perm_mean2use,
#                         sd2use = perm_sd2use)
#   confmat = table(tmp_test$z_ds_group,pred_labels$z_ds_group)
#   acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/dim(pred_labels)[1]
#
#   # compute model
#   permuted_labels = sample(tmp_train$z_ds_group)
#   mod2use = svm(x = tmp_train[,vars2use], y = permuted_labels)
#   pred_labels = predict(mod2use, tmp_test[,vars2use])
#   confmat = table(tmp_test$z_ds_group,pred_labels)
#   acc[iperm] = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#   #============================================================================
# } # for (iperm in 1:nperm)
#
# df2plot = data.frame(Accuracy = acc)
# p = ggplot(data = df2plot, aes(x = Accuracy)) + geom_histogram() + geom_vline(xintercept=true_accuracy)
# p
#
# # compute p-value
# pval = sum(c(true_accuracy,acc)>=true_accuracy)/(nperm+1)
# pval
#================================================================================
#================================================================================

#------------------------------------------------------------------------------
# Plot difference score Z subtypes in Discovery set
maxScores = c(3,4)

# Discovery - make plot with all individuals shown as lines
# subgrp comes from tmp_train, which is expected to hold the Discovery rows in
# the same order as Dverbal_Discovery.
df2use = Dverbal_Discovery[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(tmp_train$z_ds_group)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

# long format: one row per individual and subscale (SC, RRB)
df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("SC","RRB"))

# one facet per subtype; each individual is a pair of points joined by a line
p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
# guides(colour = "none") replaces the deprecated guides(color = FALSE) form
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + ylim(0,1) + guides(colour = "none")
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
# the legend is already removed on p, so no second guides() call is needed
p3_middle_top = p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Disc_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p3_middle_top)
p

# Plot difference score Z subtypes in Replication set
maxScores = c(3,4)

# Replication - make plot with all individuals shown as lines
# subgrp comes from tmp_test, which is expected to hold the Replication rows in
# the same order as Dverbal_Replication.
df2use = Dverbal_Replication[,c("subjectkey",adi_total_vars2use)]
df2use$subgrp = factor(tmp_test$z_ds_group)
df2use = data.frame(df2use)
df2use$subjectkey = factor(df2use$subjectkey)
df2use$SC = df2use$dbaes_atotal
df2use$RRB = df2use$dbaes_btotal

# long format: one row per individual and subscale (SC, RRB)
df4plot = melt(df2use,
               id.vars = c("subjectkey","subgrp"),
               measure.vars = c("SC","RRB"))

# one facet per subtype; each individual is a pair of points joined by a line
p = ggplot(data = df4plot, aes(x = variable,
                               y = value,
                               colour = subgrp,
                               group = subjectkey)) + facet_grid(. ~ subgrp)
# guides(colour = "none") replaces the deprecated guides(color = FALSE) form
p = p + geom_point(shape=1) + geom_line(alpha = 0.2) + ylim(0,1) + guides(colour = "none")
p = p + ylab("Percent Severity") + xlab("ADI-R subscale")
# the legend is already removed on p, so no second guides() call is needed
p4_middle_bottom = p
ggsave(filename = file.path(plotpath, sprintf("final_NDAR_Rep_jitterplot_z%s.pdf",as.character(z_thresh))), plot = p4_middle_bottom)
p

#------------------------------------------------------------------------------
# Apply NDAR subtypes to EU-AIMS LEAP data

# NDAR data (tidy_verbal.csv) supplies the norms; EU-AIMS supplies the
# individuals to be labelled.
Dverbal = read.csv(file.path(datapath,"tidy_verbal.csv"))
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))

# keep ASD participants that have all of the A1-A3 and B1-B4 percent-severity
# scores (mask2 flags rows with at least one of them missing)
mask1 = euaims_data$Diagnosis=="ASD"
mask2 = (is.na(euaims_data$A1_pct_severity) |
           is.na(euaims_data$A2_pct_severity) |
           is.na(euaims_data$A3_pct_severity) |
           is.na(euaims_data$B1_pct_severity) |
           is.na(euaims_data$B2_pct_severity) |
           is.na(euaims_data$B3_pct_severity) |
           is.na(euaims_data$B4_pct_severity))
euaims_data = subset(euaims_data, (mask1 & !mask2))

# Build the two subtype variables for EU-AIMS as the mean of the domain-level
# percent-severity scores (three A domains, four B domains). The NDAR columns
# in Dverbal are used as they are; the former self-assignments of
# Dverbal[, vars2use] did nothing and have been dropped.
euaims_data[,vars2use[1]] = (euaims_data$A1_pct_severity +
                               euaims_data$A2_pct_severity +
                               euaims_data$A3_pct_severity)/3
euaims_data[,vars2use[2]] = (euaims_data$B1_pct_severity +
                               euaims_data$B2_pct_severity +
                               euaims_data$B3_pct_severity +
                               euaims_data$B4_pct_severity)/4

train_data = Dverbal
test_data = euaims_data

# mean and sd of the NDAR difference score, ignoring missing values
mean2use = mean(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
sd2use = sd(train_data[,vars2use[1]] - train_data[,vars2use[2]], na.rm=TRUE)
c(mean2use, sd2use)
## [1] -0.01045243  0.19482749
# Subtype both sets with the same norms: mean2use and sd2use are the values
# computed from train_data, so test_data is labelled on the training scale.
tmp_train = make_subtype(
  data2use = train_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

tmp_test = make_subtype(
  data2use = test_data,
  z_thresh = z_thresh,
  mean2use = mean2use,
  sd2use = sd2use
)

#===========================================================================
#===========================================================================
# # compute model
# mod2use = svm(x = tmp_train[,vars2use], y = tmp_train$z_ds_group)
# pred_labels = predict(mod2use, tmp_test[,vars2use])
# confmat = table(tmp_test$z_ds_group,pred_labels)
# acc = (confmat[1,1]+confmat[2,2]+confmat[3,3])/length(pred_labels)
#
# tmp_test$svm_pred_labels = pred_labels
#
# # plot confusion matrix
# setHook("grid.newpage", function() pushViewport(viewport(x=1,y=1,width=0.9, height=0.9, name="vp", just=c("right","top"))), action="prepend")
# pheatmap(confmat/rowSums(confmat)*100, display_numbers = confmat, color = colorRampPalette(c('white','red'))(100), cluster_rows = FALSE, cluster_cols = FALSE, fontsize_number = fontSize, fontsize_row = fontSize, fontsize_col = fontSize,labels_row = c("RRB>SC","SC=RRB","SC>RRB"),labels_col = c("RRB>SC","SC=RRB","SC>RRB"),angle_col=90,
#          breaks= seq(0,100, length=100))
# setHook("grid.newpage", NULL, "replace")
# grid::grid.text("Actual Labels", y=-0.07, gp=gpar(fontsize=fontSize))
# grid::grid.text("Predicted Labels", x=-0.07, rot=90, gp=gpar(fontsize=fontSize))
#===========================================================================
#===========================================================================

# scatterplot
# NDAR (all individuals) coloured by subtype
p1 = ggplot(
  data = tmp_train,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))
) +
  geom_point() +
  xlab("Social-Communication") +
  ylab("Restricted Repetitive Behaviors") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("NDAR ALL")
p1

# EU-AIMS individuals coloured by the subtype assigned with the NDAR norms
p2 = ggplot(
  data = tmp_test,
  aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(z_ds_group))
) +
  geom_point() +
  xlab("SC") +
  ylab("RRB") +
  ylim(0, 1) +
  xlim(0, 1) +
  ggtitle("EU-AIMS with Groups from NDAR ALL") +
  theme(text = element_text(size=fontSize),
        axis.text.x = element_text(size=fontSize),
        axis.text.y = element_text(size=fontSize))

# Legend-free copy that is written to disk. guides(colour = "none") replaces
# the deprecated guides(colour = FALSE) form and draws the same plot.
p5_bottom_right = p2 + guides(colour = "none")
ggsave(
  filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))),
  plot = p5_bottom_right
)
p2

#===========================================================================
#===========================================================================
# p3 = ggplot(data = tmp_test, aes(x = dbaes_atotal, y = dbaes_btotal, colour = factor(pred_labels))) + geom_point() + xlab("SC") + ylab("RRB") + ylim(0,0.8) + xlim(0,0.8) + ggtitle("EU-AIMS")
# p5_bottom_right = p3 + guides(colour=FALSE)
# ggsave(filename = file.path(plotpath, sprintf("final_EUAIMS_scatterplot_z%s.pdf",as.character(z_thresh))), plot = p5_bottom_right)
# p3
#===========================================================================
#===========================================================================

# Write out the EU-AIMS LEAP data with subgroups defined by NDAR All; the
# z threshold is embedded in the file name.
write.csv(
  tmp_test,
  file.path(datapath, sprintf("tidy_euaims_NDAR_subtypes_diffscore_z%s.csv",as.character(z_thresh)))
)

#===========================================================================
#===========================================================================
# # Make final plot
# p_final = p1_top_left + p3_middle_top + plot_spacer() + p2_bottom_left + p4_middle_bottom + p5_bottom_right + plot_layout(nrow=3, ncol=3, widths = c(4,4,4), heights = c(8,8,8))
# ggsave(filename = file.path(plotpath, sprintf("final_NDAR_EUAIMS_subtypes_plot_z%s.pdf",as.character(z_thresh))), plot = p_final)
# p_final
#===========================================================================
#===========================================================================

#------------------------------------------------------------------------------
# Integrate subtypes with rest of EU-AIMS LEAP data
euaims_data = read.csv(file.path(datapath,"tidy_euaims.csv"))

# TD participants: severity scores are set to NA and the subgroup label to "TD"
# so that the rows can be stacked with the ASD rows below.
# NOTE(review): select=2:6 is positional; these columns presumably are subid,
# age, Centre, Schedule and Diagnosis (the first five names in cols2use), which
# rbind() needs in order to match the ASD columns - confirm.
td_df = subset(euaims_data, euaims_data$Diagnosis=="TD",select=2:6)
td_df$A_pct_severity = as.numeric(NA)
td_df$B_pct_severity = as.numeric(NA)
td_df$subgrp = "TD"

#===========================================================================
#===========================================================================
# cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","svm_pred_labels")
cols2use = c("subid","age","Centre","Schedule","Diagnosis","dbaes_atotal","dbaes_btotal","z_ds_group")
#===========================================================================
#===========================================================================
# ASD participants: rename the score and subtype columns (positions 6-8 of
# cols2use) to the names used in td_df
tmp_asd = tmp_test[,cols2use]
colnames(tmp_asd)[6] = "A_pct_severity"
colnames(tmp_asd)[7] = "B_pct_severity"
colnames(tmp_asd)[8] = "subgrp"
asd_df = tmp_asd

# (An earlier rbind(td_df, asd_df) at this point was dead code: all_data is
# rebuilt below, after sex has been merged in, before it is ever used.)

# NOTE(review): hard-coded absolute path to a user-specific directory; this only
# runs on a machine with that location. Left unchanged here.
fname = "/Users/mlombardo/Dropbox/euaims/data/rsfmri_preproc/euaims_preproc.xlsx"
pp_data = read_excel(fname)
# keep only rows whose notes column is "ok"
mask = pp_data$notes=="ok"
pp_data = subset(pp_data, mask)

# add sex from the preprocessing sheet; merge() keeps only subids present in
# both tables
asd_df = merge(pp_data[,c("subid","sex")],asd_df, by = "subid")
td_df = merge(pp_data[,c("subid","sex")],td_df, by = "subid")

all_data = rbind(td_df,asd_df)

# Columns that occur in both tables come out of merge() with .x (pp_data) and
# .y (all_data) suffixes; age is taken from pp_data and sex from all_data.
data2write = merge(pp_data, all_data, by = "subid")
data2write$age = data2write$age.x
data2write$sex = data2write$sex.y
print(table(data2write$Schedule, data2write$subgrp))
##    
##     RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   A           4           47          29 78
##   B           0           54          31 83
##   C           3           39          21 59
##   D           0           20          18 23
# Cross-tabulate Centre by subgroup (TD plus the ASD subtypes)
print(table(data2write$Centre, data2write$subgrp))
##                
##                 RRB_over_SC SC_equal_RRB SC_over_RRB TD
##   CAMBRIDGE               2           33           7 29
##   KINGS_COLLEGE           4           59          43 78
##   MANNHEIM                0            0           0 34
##   NIJMEGEN                1           56          36 64
##   UTRECHT                 0           12          13 38
# Cross-tabulate sex by subgroup (TD plus the ASD subtypes)
print(table(data2write$sex, data2write$subgrp))
##         
##          RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Female           3           41          27  88
##   Male             4          119          72 155
#DSM-5 - find best split that balances participants across sites
a = findBestSplit(asd_df, seed_range = c(172342)) #,300001:500000))
## [1] 172342
# Keep every seed whose discrepancy equals the smallest non-missing value;
# which() discards comparisons that evaluate to NA.
min_discrepancy = min(a$discrepancy, na.rm = TRUE)
best_seeds = a$seed[which(a$discrepancy==min_discrepancy)]
print(best_seeds)
## [1] 172342
# Split datasets -------------------------------------------------------------
# The first of the tied seeds drives every split below.
rngSeed = best_seeds[1]

# Split the ASD rows of each Schedule (A, B, C, D, in that order) with the same
# seed. For every split, element 1 of the returned list is used as the
# Replication half and element 2 as the Discovery half.
schedule_splits = list()
for (sched in c("A","B","C","D")) {
  dset2use = subset(asd_df, asd_df$Schedule==sched)
  tmp_d = SplitDatasetsBySex(dset2use, rngSeed = rngSeed)
  schedule_splits[[sched]] = tmp_d
}

# Per-schedule halves, kept under their original names
A_Discovery = schedule_splits[["A"]][[2]]
A_Replication = schedule_splits[["A"]][[1]]
B_Discovery = schedule_splits[["B"]][[2]]
B_Replication = schedule_splits[["B"]][[1]]
C_Discovery = schedule_splits[["C"]][[2]]
C_Replication = schedule_splits[["C"]][[1]]
D_Discovery = schedule_splits[["D"]][[2]]
D_Replication = schedule_splits[["D"]][[1]]

# Pool the schedules back together within each half
df_Disc = rbind(A_Discovery, B_Discovery, C_Discovery, D_Discovery)
df_Rep = rbind(A_Replication, B_Replication, C_Replication, D_Replication)

# Schedule x Centre counts in the Discovery half
a = table(df_Disc[["Schedule"]], df_Disc[["Centre"]])
print(a)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         6            17       10       7
##   B         9            16       14       3
##   C         6             9       14       2
##   D         0            10       10       0
# Schedule x Centre counts in the Replication half
b = table(df_Rep[["Schedule"]], df_Rep[["Centre"]])
print(b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A         7            18       10       5
##   B         8            17       13       5
##   C         6            10       13       3
##   D         0             9        9       0
# Cell-wise difference between the two halves
print(a-b)
##    
##     CAMBRIDGE KINGS_COLLEGE NIJMEGEN UTRECHT
##   A        -1            -1        0       2
##   B         1            -1        1      -2
##   C         0            -1        1      -1
##   D         0             1        1       0
# Total absolute imbalance, summed over every cell
print(sum(abs(a-b)))
## [1] 14
# Label each row with the half its subid was assigned to; rows whose subid is
# in neither split keep NA.
data2write$dataset = NA
mask  = data2write$subid %in% df_Disc$subid
data2write[mask,"dataset"] = "Discovery"
mask  = data2write$subid %in% df_Rep$subid
data2write[mask,"dataset"] = "Replication"

# ASD rows of each half; which() drops rows where the condition is NA
asd_Disc = data2write[which(data2write$dataset=="Discovery" & data2write$Diagnosis=="ASD"), ]
asd_Rep = data2write[which(data2write$dataset=="Replication" & data2write$Diagnosis=="ASD"), ]

# # find which seed gives best TD age-match -------------------------------------
# seeds = 1:1000
# pvals = data.frame(matrix(nrow = length(seeds), ncol = 2))
# for (i in 1:length(seeds)) {
#   res = findTDAgeMatch(data2write, seed_range = c(seeds[i],seeds[i]))
#   td_Disc_matched = res[[2]]
#   td_Rep_matched = res[[1]]
#   tres = t.test(td_Disc_matched$age, asd_Disc$age)
#   pvals[i,1] = tres$p.value
#   tres = t.test(td_Rep_matched$age, asd_Rep$age)
#   pvals[i,2] = tres$p.value
#   #print(i)
# }
# a = sort.int(pvals[,1], decreasing = TRUE, index.return = TRUE)
# b=pvals[a$ix,]

# Run the TD age-matching with a single fixed seed (passed as both ends of
# seed_range). Element 1 of the result is used as the Replication TD set and
# element 2 as the Discovery TD set.
seed2use = 929
res = findTDAgeMatch(data2write, seed_range = rep(seed2use, 2))
td_Rep_matched = res[[1]]
td_Disc_matched = res[[2]]

# Label the matched TD rows with their half
mask = data2write$subid %in% td_Disc_matched$subid
data2write$dataset[mask] = "Discovery"

mask = data2write$subid %in% td_Rep_matched$subid
data2write$dataset[mask] = "Replication"

# Final subgroup counts within each half
print(table(data2write[["dataset"]], data2write[["subgrp"]]))
##              
##               RRB_over_SC SC_equal_RRB SC_over_RRB  TD
##   Discovery             6           77          50 121
##   Replication           1           83          49 122
# Save the labelled table at the project root; the file name carries the
# z threshold. write.csv() keeps row names by default, so the file gets a
# leading unnamed column.
fname2save = here(
  sprintf("asd_subgrp_data_rsfmri_ALL_DSM5_diffzscoreGrps_z%s.csv",
          as.character(z_thresh))
)
write.csv(x = data2write, file = fname2save)


#------------------------------------------------------------------------------
# Descriptive stats
# Working copy restricted to identifiers, grouping factors and the measures
# that get summarised.
df2use = data2write[,c("subid","Diagnosis","dataset","Centre","meanFD","age","sex","subgrp",
                       "viq_all","piq_all","fsiq4_all",
                       "A_pct_severity","B_pct_severity",
                       "ADI_social_total","ADI_communication_total","ADI_RRB_total",
                       "ados_2_SA_CSS","ados_2_RRB_CSS",
                       "SRS_tscore","SRS_tscore_self","RBS_total","SSP_total",
                       "vabsdscoresc_dss","vabsdscoresd_dss","vabsdscoress_dss","vabsabcabc_standard")]

# Cells equal to 999 or 777 are blanked to NA in every column
mask = df2use==999 | df2use==777
df2use[mask] = NA

# Rescale age by 365 (presumably days -> years; confirm the unit of `age`)
df2use[["age"]] = df2use[["age"]]/365

# Summarise each measure within every subgrp x dataset cell
cols2use = c("dataset","subgrp","age","meanFD",
             "viq_all","piq_all","fsiq4_all",
             "A_pct_severity","B_pct_severity",
             "ADI_social_total","ADI_communication_total","ADI_RRB_total",
             "ados_2_SA_CSS","ados_2_RRB_CSS",
             "SRS_tscore","SRS_tscore_self","RBS_total","SSP_total",
             "vabsdscoresc_dss","vabsdscoresd_dss","vabsdscoress_dss","vabsabcabc_standard")
res = describeBy(x = df2use[,cols2use], group = c("subgrp","dataset"))
res
## 
##  Descriptive statistics by group 
## subgrp: RRB_over_SC
## dataset: Discovery
##                         vars n   mean    sd median trimmed   mad    min    max
## dataset*                   1 6    NaN    NA     NA     NaN    NA    Inf   -Inf
## subgrp*                    2 6    NaN    NA     NA     NaN    NA    Inf   -Inf
## age                        3 6  18.08  7.13  21.77   18.08  2.50   7.89  23.88
## meanFD                     4 6   0.29  0.42   0.15    0.29  0.07   0.06   1.14
## viq_all                    5 6 102.64 16.25 103.00  102.64 13.34  78.00 123.85
## piq_all                    6 6 102.83 10.76 103.00  102.83 13.34  87.00 114.00
## fsiq4_all                  7 6 102.67 14.63 103.00  102.67 14.83  80.00 118.01
## A_pct_severity             8 6   0.18  0.12   0.12    0.18  0.06   0.05   0.35
## B_pct_severity             9 6   0.47  0.09   0.48    0.47  0.11   0.36   0.59
## ADI_social_total          10 6  15.83  8.33  16.50   15.83  9.64   5.00  26.00
## ADI_communication_total   11 6  14.67  6.31  14.50   14.67  7.41   6.00  23.00
## ADI_RRB_total             12 6   9.00  1.26   9.50    9.00  0.74   7.00  10.00
## ados_2_SA_CSS             13 6   3.83  3.37   2.50    3.83  2.22   1.00   9.00
## ados_2_RRB_CSS            14 6   4.83  4.22   4.50    4.83  5.19   1.00   9.00
## SRS_tscore                15 3  71.33 16.65  66.00   71.33 11.86  58.00  90.00
## SRS_tscore_self           16 4  59.00  7.12  61.00   59.00  4.45  49.00  65.00
## RBS_total                 17 3  17.33 10.69  15.00   17.33 10.38   8.00  29.00
## SSP_total                 18 2 143.00 33.94 143.00  143.00 35.58 119.00 167.00
## vabsdscoresc_dss          19 4  86.50 20.68  82.00   86.50 14.83  67.00 115.00
## vabsdscoresd_dss          20 4  71.00  9.63  74.00   71.00  3.71  57.00  79.00
## vabsdscoress_dss          21 4  70.75 10.34  72.00   70.75  8.15  57.00  82.00
## vabsabcabc_standard       22 4  74.00 12.03  74.50   74.00 11.86  59.00  88.00
##                         range  skew kurtosis    se
## dataset*                 -Inf    NA       NA    NA
## subgrp*                  -Inf    NA       NA    NA
## age                     16.00 -0.52    -1.90  2.91
## meanFD                   1.09  1.32    -0.16  0.17
## viq_all                 45.85 -0.18    -1.59  6.64
## piq_all                 27.00 -0.20    -1.80  4.39
## fsiq4_all               38.01 -0.31    -1.70  5.97
## A_pct_severity           0.30  0.42    -1.90  0.05
## B_pct_severity           0.23  0.05    -2.03  0.04
## ADI_social_total        21.00 -0.06    -2.00  3.40
## ADI_communication_total 17.00 -0.02    -1.77  2.58
## ADI_RRB_total            3.00 -0.49    -1.70  0.52
## ados_2_SA_CSS            8.00  0.51    -1.77  1.38
## ados_2_RRB_CSS           8.00  0.02    -2.29  1.72
## SRS_tscore              32.00  0.29    -2.33  9.61
## SRS_tscore_self         16.00 -0.50    -1.88  3.56
## RBS_total               21.00  0.21    -2.33  6.17
## SSP_total               48.00  0.00    -2.75 24.00
## vabsdscoresc_dss        48.00  0.42    -1.89 10.34
## vabsdscoresd_dss        22.00 -0.61    -1.76  4.81
## vabsdscoress_dss        25.00 -0.26    -1.87  5.17
## vabsabcabc_standard     29.00 -0.09    -1.94  6.01
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 77    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 77    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 77  16.29  5.72  14.98   16.09  5.98  7.08  30.28
## meanFD                     4 77   0.28  0.45   0.18    0.22  0.13  0.03   3.95
## viq_all                    5 76  98.00 19.20 101.00   98.01 17.71 61.00 140.91
## piq_all                    6 76  99.57 21.64 102.00   99.65 19.98 58.00 150.00
## fsiq4_all                  7 77  98.94 18.89 102.50   99.34 19.88 60.00 143.00
## A_pct_severity             8 77   0.31  0.14   0.31    0.31  0.13  0.00   0.63
## B_pct_severity             9 77   0.29  0.16   0.28    0.28  0.17  0.01   0.69
## ADI_social_total          10 77  16.83  6.81  16.00   17.13  8.90  2.00  27.00
## ADI_communication_total   11 77  14.00  5.87  14.00   14.08  5.93  0.00  26.00
## ADI_RRB_total             12 77   5.08  2.54   5.00    5.06  2.97  0.00  12.00
## ados_2_SA_CSS             13 76   6.22  2.60   6.50    6.31  2.97  1.00  10.00
## ados_2_RRB_CSS            14 76   4.86  2.83   5.00    4.82  2.97  1.00  10.00
## SRS_tscore                15 67  72.04 12.35  74.00   72.33 14.83 45.00  95.00
## SRS_tscore_self           16 35  63.11 12.85  64.00   62.14 13.34 43.00  94.00
## RBS_total                 17 64  18.83 16.40  17.00   16.42 14.08  0.00  90.00
## SSP_total                 18 44 135.86 31.26 138.50  136.92 30.39 53.00 189.00
## vabsdscoresc_dss          19 74  73.65 17.51  75.00   74.23 11.86 21.00 122.00
## vabsdscoresd_dss          20 73  73.70 16.61  73.00   73.47 11.86 25.00 131.00
## vabsdscoress_dss          21 74  70.78 15.69  73.00   71.93 13.34 20.00 104.00
## vabsabcabc_standard       22 73  71.27 13.20  72.00   71.47 10.38 20.00 103.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.20  0.38    -0.76 0.65
## meanFD                    3.92  6.92    52.84 0.05
## viq_all                  79.91 -0.08    -0.73 2.20
## piq_all                  92.00 -0.10    -0.56 2.48
## fsiq4_all                83.00 -0.22    -0.74 2.15
## A_pct_severity            0.63  0.07    -0.45 0.02
## B_pct_severity            0.68  0.42    -0.48 0.02
## ADI_social_total         25.00 -0.31    -1.01 0.78
## ADI_communication_total  26.00 -0.11    -0.73 0.67
## ADI_RRB_total            12.00  0.11    -0.51 0.29
## ados_2_SA_CSS             9.00 -0.32    -1.12 0.30
## ados_2_RRB_CSS            9.00 -0.24    -1.21 0.32
## SRS_tscore               50.00 -0.15    -0.95 1.51
## SRS_tscore_self          51.00  0.48    -0.39 2.17
## RBS_total                90.00  1.75     4.28 2.05
## SSP_total               136.00 -0.37    -0.39 4.71
## vabsdscoresc_dss        101.00 -0.41     1.24 2.04
## vabsdscoresd_dss        106.00  0.18     1.76 1.94
## vabsdscoress_dss         84.00 -0.74     0.69 1.82
## vabsabcabc_standard      83.00 -0.53     2.46 1.54
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Discovery
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 50    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 50    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 50  16.26  5.03  15.88   15.96  4.52  7.78  29.40
## meanFD                     4 50   0.23  0.24   0.14    0.17  0.10  0.04   1.08
## viq_all                    5 49  96.93 17.95  98.00   96.50 17.25 65.55 142.00
## piq_all                    6 49  99.20 20.87 102.00  100.51 19.27 52.43 136.38
## fsiq4_all                  7 50  98.27 18.28 100.68   98.97 19.75 59.00 128.30
## A_pct_severity             8 50   0.46  0.13   0.46    0.46  0.11  0.19   0.82
## B_pct_severity             9 50   0.16  0.09   0.15    0.16  0.12  0.00   0.33
## ADI_social_total          10 50  18.52  6.44  19.50   19.07  6.67  3.00  28.00
## ADI_communication_total   11 50  14.54  5.03  15.00   14.75  5.19  2.00  24.00
## ADI_RRB_total             12 50   2.96  2.02   3.00    2.85  1.48  0.00   8.00
## ados_2_SA_CSS             13 48   6.31  2.54   7.00    6.42  2.97  1.00  10.00
## ados_2_RRB_CSS            14 48   4.71  2.79   5.50    4.60  2.22  1.00  10.00
## SRS_tscore                15 44  73.23 10.97  74.50   73.89  8.15 44.00  90.00
## SRS_tscore_self           16 21  61.71  9.86  61.00   60.94  8.90 42.00  89.00
## RBS_total                 17 43  14.72 13.27  12.00   13.11 13.34  0.00  54.00
## SSP_total                 18 33 138.64 28.33 139.00  139.15 37.06 78.00 183.00
## vabsdscoresc_dss          19 45  69.29 16.26  71.00   71.16 10.38 21.00  99.00
## vabsdscoresd_dss          20 45  68.31 16.18  66.00   68.68 11.86 17.00 112.00
## vabsdscoress_dss          21 45  66.24 15.96  68.00   68.08 14.83 20.00  95.00
## vabsabcabc_standard       22 45  65.71 15.45  68.00   67.62 10.38  6.00  91.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      21.62  0.53    -0.05 0.71
## meanFD                    1.04  2.20     4.21 0.03
## viq_all                  76.45  0.21    -0.40 2.56
## piq_all                  83.96 -0.57    -0.42 2.98
## fsiq4_all                69.30 -0.30    -0.92 2.59
## A_pct_severity            0.63  0.14     0.06 0.02
## B_pct_severity            0.33  0.10    -1.12 0.01
## ADI_social_total         25.00 -0.63    -0.45 0.91
## ADI_communication_total  22.00 -0.37    -0.39 0.71
## ADI_RRB_total             8.00  0.56    -0.31 0.29
## ados_2_SA_CSS             9.00 -0.46    -0.85 0.37
## ados_2_RRB_CSS            9.00 -0.15    -1.12 0.40
## SRS_tscore               46.00 -0.59     0.15 1.65
## SRS_tscore_self          47.00  0.70     0.93 2.15
## RBS_total                54.00  1.14     1.05 2.02
## SSP_total               105.00 -0.06    -1.14 4.93
## vabsdscoresc_dss         78.00 -1.26     2.12 2.42
## vabsdscoresd_dss         95.00 -0.37     1.76 2.41
## vabsdscoress_dss         75.00 -1.11     1.34 2.38
## vabsabcabc_standard      85.00 -1.83     4.55 2.30
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Discovery
##                         vars   n   mean    sd median trimmed   mad    min
## dataset*                   1 121    NaN    NA     NA     NaN    NA    Inf
## subgrp*                    2 121    NaN    NA     NA     NaN    NA    Inf
## age                        3 121  16.83  5.23  16.65   16.73  5.69   7.22
## meanFD                     4 121   0.18  0.15   0.13    0.15  0.07   0.03
## viq_all                    5 119 104.52 19.70 105.00  105.03 19.27  46.00
## piq_all                    6 119 106.10 19.47 107.00  107.53 17.79  49.00
## fsiq4_all                  7 119 105.72 18.33 108.18  106.99 16.58  53.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA    Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA    Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA    Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA    Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA    Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA    Inf
## SRS_tscore                15  68  47.84  9.40  45.00   46.32  5.19  37.00
## SRS_tscore_self           16  71  46.69  4.85  46.00   46.26  4.45  39.00
## RBS_total                 17  68   2.15  4.74   0.00    0.95  0.00   0.00
## SSP_total                 18  59 177.86 12.71 182.00  179.78  8.90 122.00
## vabsdscoresc_dss          19  34  91.97 25.44  99.50   93.50 21.50  21.00
## vabsdscoresd_dss          20  34  90.74 20.28  98.50   92.25 17.05  33.00
## vabsdscoress_dss          21  34  96.21 23.67 102.50   98.57 21.50  33.00
## vabsabcabc_standard       22  34  92.06 23.04  99.50   93.89 15.57  25.00
##                            max  range  skew kurtosis   se
## dataset*                  -Inf   -Inf    NA       NA   NA
## subgrp*                   -Inf   -Inf    NA       NA   NA
## age                      29.84  22.62  0.17    -0.51 0.48
## meanFD                    0.85   0.82  2.28     5.65 0.01
## viq_all                 160.00 114.00 -0.24     0.38 1.81
## piq_all                 147.00  98.00 -0.69     0.32 1.79
## fsiq4_all               142.00  89.00 -0.69     0.58 1.68
## A_pct_severity            -Inf   -Inf    NA       NA   NA
## B_pct_severity            -Inf   -Inf    NA       NA   NA
## ADI_social_total          -Inf   -Inf    NA       NA   NA
## ADI_communication_total   -Inf   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf   -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf   -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf   -Inf    NA       NA   NA
## SRS_tscore               76.00  39.00  1.54     1.44 1.14
## SRS_tscore_self          63.00  24.00  0.93     1.10 0.58
## RBS_total                27.00  27.00  3.13    10.87 0.57
## SSP_total               190.00  68.00 -1.90     4.85 1.66
## vabsdscoresc_dss        138.00 117.00 -0.71     0.36 4.36
## vabsdscoresd_dss        121.00  88.00 -0.80     0.07 3.48
## vabsdscoress_dss        129.00  96.00 -0.83    -0.31 4.06
## vabsabcabc_standard     127.00 102.00 -0.90     0.30 3.95
## ------------------------------------------------------------ 
## subgrp: RRB_over_SC
## dataset: Replication
##                         vars n   mean sd median trimmed mad    min    max range
## dataset*                   1 1    NaN NA     NA     NaN  NA    Inf   -Inf  -Inf
## subgrp*                    2 1    NaN NA     NA     NaN  NA    Inf   -Inf  -Inf
## age                        3 1  11.45 NA  11.45   11.45   0  11.45  11.45     0
## meanFD                     4 1   0.38 NA   0.38    0.38   0   0.38   0.38     0
## viq_all                    5 1 143.00 NA 143.00  143.00   0 143.00 143.00     0
## piq_all                    6 1 148.00 NA 148.00  148.00   0 148.00 148.00     0
## fsiq4_all                  7 1 148.00 NA 148.00  148.00   0 148.00 148.00     0
## A_pct_severity             8 1   0.15 NA   0.15    0.15   0   0.15   0.15     0
## B_pct_severity             9 1   0.40 NA   0.40    0.40   0   0.40   0.40     0
## ADI_social_total          10 1  16.00 NA  16.00   16.00   0  16.00  16.00     0
## ADI_communication_total   11 1   6.00 NA   6.00    6.00   0   6.00   6.00     0
## ADI_RRB_total             12 1   7.00 NA   7.00    7.00   0   7.00   7.00     0
## ados_2_SA_CSS             13 1   5.00 NA   5.00    5.00   0   5.00   5.00     0
## ados_2_RRB_CSS            14 1   1.00 NA   1.00    1.00   0   1.00   1.00     0
## SRS_tscore                15 1  60.00 NA  60.00   60.00   0  60.00  60.00     0
## SRS_tscore_self           16 0    NaN NA     NA     NaN  NA    Inf   -Inf  -Inf
## RBS_total                 17 1  13.00 NA  13.00   13.00   0  13.00  13.00     0
## SSP_total                 18 1 153.00 NA 153.00  153.00   0 153.00 153.00     0
## vabsdscoresc_dss          19 1  99.00 NA  99.00   99.00   0  99.00  99.00     0
## vabsdscoresd_dss          20 1  74.00 NA  74.00   74.00   0  74.00  74.00     0
## vabsdscoress_dss          21 1  76.00 NA  76.00   76.00   0  76.00  76.00     0
## vabsabcabc_standard       22 1  81.00 NA  81.00   81.00   0  81.00  81.00     0
##                         skew kurtosis se
## dataset*                  NA       NA NA
## subgrp*                   NA       NA NA
## age                       NA       NA NA
## meanFD                    NA       NA NA
## viq_all                   NA       NA NA
## piq_all                   NA       NA NA
## fsiq4_all                 NA       NA NA
## A_pct_severity            NA       NA NA
## B_pct_severity            NA       NA NA
## ADI_social_total          NA       NA NA
## ADI_communication_total   NA       NA NA
## ADI_RRB_total             NA       NA NA
## ados_2_SA_CSS             NA       NA NA
## ados_2_RRB_CSS            NA       NA NA
## SRS_tscore                NA       NA NA
## SRS_tscore_self           NA       NA NA
## RBS_total                 NA       NA NA
## SSP_total                 NA       NA NA
## vabsdscoresc_dss          NA       NA NA
## vabsdscoresd_dss          NA       NA NA
## vabsdscoress_dss          NA       NA NA
## vabsabcabc_standard       NA       NA NA
## ------------------------------------------------------------ 
## subgrp: SC_equal_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 83    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 83    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 83  16.51  5.67  16.33   16.24  6.06  7.12  30.15
## meanFD                     4 83   0.25  0.27   0.18    0.19  0.10  0.05   1.60
## viq_all                    5 82 102.11 16.52 102.73  102.78 18.21 62.90 133.00
## piq_all                    6 82 103.83 18.60 106.00  105.21 19.27 52.00 134.00
## fsiq4_all                  7 82 103.28 16.55 106.38  104.15 17.05 64.00 131.00
## A_pct_severity             8 83   0.28  0.13   0.26    0.27  0.12  0.04   0.65
## B_pct_severity             9 83   0.23  0.13   0.23    0.23  0.13  0.00   0.67
## ADI_social_total          10 83  14.96  5.92  15.00   15.21  5.93  1.00  27.00
## ADI_communication_total   11 83  11.84  5.36  11.00   11.82  5.93  0.00  24.00
## ADI_RRB_total             12 83   3.94  2.29   4.00    3.87  2.97  0.00   9.00
## ados_2_SA_CSS             13 81   5.68  2.51   6.00    5.74  2.97  1.00  10.00
## ados_2_RRB_CSS            14 81   4.91  2.52   5.00    4.91  1.48  1.00  10.00
## SRS_tscore                15 76  67.13 11.61  68.00   67.06 13.34 43.00  90.00
## SRS_tscore_self           16 40  61.90  7.86  61.50   61.66  6.67 46.00  84.00
## RBS_total                 17 73  13.37 11.30  10.00   11.92 10.38  0.00  52.00
## SSP_total                 18 50 142.44 26.14 142.50  144.18 33.36 69.00 184.00
## vabsdscoresc_dss          19 76  82.01 13.81  80.00   81.24 12.60 50.00 122.00
## vabsdscoresd_dss          20 75  78.24 15.27  77.00   77.59 13.34 38.00 119.00
## vabsdscoress_dss          21 76  75.91 15.22  77.00   76.52 11.86 28.00 112.00
## vabsabcabc_standard       22 75  77.09 12.83  77.00   76.74  8.90 39.00 117.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      23.03  0.35    -0.56 0.62
## meanFD                    1.55  3.50    13.51 0.03
## viq_all                  70.10 -0.30    -0.77 1.82
## piq_all                  82.00 -0.63    -0.05 2.05
## fsiq4_all                67.00 -0.44    -0.55 1.83
## A_pct_severity            0.61  0.45     0.09 0.01
## B_pct_severity            0.67  0.60     0.45 0.01
## ADI_social_total         26.00 -0.31    -0.61 0.65
## ADI_communication_total  24.00  0.06    -0.62 0.59
## ADI_RRB_total             9.00  0.27    -0.49 0.25
## ados_2_SA_CSS             9.00 -0.22    -0.82 0.28
## ados_2_RRB_CSS            9.00 -0.40    -0.78 0.28
## SRS_tscore               47.00  0.03    -0.89 1.33
## SRS_tscore_self          38.00  0.42     0.53 1.24
## RBS_total                52.00  1.26     1.53 1.32
## SSP_total               115.00 -0.56    -0.40 3.70
## vabsdscoresc_dss         72.00  0.56     0.23 1.58
## vabsdscoresd_dss         81.00  0.31     0.29 1.76
## vabsdscoress_dss         84.00 -0.55     1.08 1.75
## vabsabcabc_standard      78.00  0.24     1.12 1.48
## ------------------------------------------------------------ 
## subgrp: SC_over_RRB
## dataset: Replication
##                         vars  n   mean    sd median trimmed   mad   min    max
## dataset*                   1 49    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 49    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 49  16.30  5.21  15.78   16.09  5.99  8.29  29.23
## meanFD                     4 49   0.23  0.21   0.16    0.20  0.11  0.04   1.31
## viq_all                    5 46  93.12 19.32  96.64   94.25 17.93 50.91 127.00
## piq_all                    6 48  96.22 21.60  99.50   97.14 20.71 44.03 138.00
## fsiq4_all                  7 47  95.43 20.11 101.36   95.83 19.81 59.00 139.00
## A_pct_severity             8 49   0.51  0.13   0.52    0.51  0.14  0.27   0.75
## B_pct_severity             9 49   0.20  0.12   0.21    0.19  0.13  0.00   0.47
## ADI_social_total          10 49  19.39  5.87  20.00   19.71  5.93  6.00  29.00
## ADI_communication_total   11 49  15.61  4.64  16.00   15.83  4.45  4.00  24.00
## ADI_RRB_total             12 49   3.88  2.54   3.00    3.76  2.97  0.00  10.00
## ados_2_SA_CSS             13 46   6.20  2.98   6.00    6.29  4.45  1.00  10.00
## ados_2_RRB_CSS            14 46   4.50  2.83   5.00    4.39  2.97  1.00   9.00
## SRS_tscore                15 41  76.61 10.61  80.00   77.48 13.34 51.00  90.00
## SRS_tscore_self           16 23  62.65 10.72  62.00   62.63 10.38 40.00  84.00
## RBS_total                 17 41  21.59 15.81  18.00   19.94 11.86  1.00  73.00
## SSP_total                 18 32 134.62 24.70 138.50  134.31 25.20 91.00 181.00
## vabsdscoresc_dss          19 44  69.50 14.57  69.00   70.47 11.86 21.00 100.00
## vabsdscoresd_dss          20 44  68.27 15.83  66.50   67.69 12.60 42.00 118.00
## vabsdscoress_dss          21 44  63.43 15.88  63.50   63.53 14.83 23.00 100.00
## vabsabcabc_standard       22 44  65.25 13.82  65.00   65.61  8.90 28.00  94.00
##                         range  skew kurtosis   se
## dataset*                 -Inf    NA       NA   NA
## subgrp*                  -Inf    NA       NA   NA
## age                     20.94  0.43    -0.77 0.74
## meanFD                   1.26  2.93    11.01 0.03
## viq_all                 76.09 -0.54    -0.59 2.85
## piq_all                 93.97 -0.42    -0.63 3.12
## fsiq4_all               80.00 -0.25    -0.85 2.93
## A_pct_severity           0.48 -0.14    -0.93 0.02
## B_pct_severity           0.47  0.30    -0.77 0.02
## ADI_social_total        23.00 -0.56    -0.53 0.84
## ADI_communication_total 20.00 -0.47    -0.51 0.66
## ADI_RRB_total           10.00  0.46    -0.72 0.36
## ados_2_SA_CSS            9.00 -0.15    -1.39 0.44
## ados_2_RRB_CSS           8.00 -0.09    -1.36 0.42
## SRS_tscore              39.00 -0.56    -0.71 1.66
## SRS_tscore_self         44.00  0.02    -0.48 2.23
## RBS_total               72.00  1.06     0.87 2.47
## SSP_total               90.00 -0.09    -0.89 4.37
## vabsdscoresc_dss        79.00 -1.01     2.18 2.20
## vabsdscoresd_dss        76.00  0.65     0.49 2.39
## vabsdscoress_dss        77.00 -0.11    -0.03 2.39
## vabsabcabc_standard     66.00 -0.36     0.79 2.08
## ------------------------------------------------------------ 
## subgrp: TD
## dataset: Replication
##                         vars   n   mean    sd median trimmed   mad   min    max
## dataset*                   1 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## subgrp*                    2 122    NaN    NA     NA     NaN    NA   Inf   -Inf
## age                        3 122  16.86  6.07  16.34   16.58  7.59  6.89  29.72
## meanFD                     4 122   0.23  0.46   0.14    0.15  0.07  0.04   4.60
## viq_all                    5 122 104.02 17.58 108.18  105.64 12.13 45.00 140.00
## piq_all                    6 122 104.64 18.41 108.96  106.56 14.08 49.00 139.00
## fsiq4_all                  7 122 104.94 17.14 108.09  107.29 11.86 50.00 134.00
## A_pct_severity             8   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## B_pct_severity             9   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_social_total          10   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_communication_total   11   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ADI_RRB_total             12   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_SA_CSS             13   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## ados_2_RRB_CSS            14   0    NaN    NA     NA     NaN    NA   Inf   -Inf
## SRS_tscore                15  65  47.23  9.34  44.00   45.66  4.45 37.00  90.00
## SRS_tscore_self           16  61  48.44  6.84  47.00   47.63  5.93 39.00  69.00
## RBS_total                 17  63   3.08 11.54   0.00    0.86  0.00  0.00  89.00
## SSP_total                 18  54 174.93 19.38 182.00  178.41  6.67 75.00 190.00
## vabsdscoresc_dss          19  39  92.74 25.45  96.00   95.82 20.76 21.00 125.00
## vabsdscoresd_dss          20  39  91.10 22.65  97.00   93.91 14.83 27.00 122.00
## vabsdscoress_dss          21  39  98.90 27.04 103.00  102.12 17.79 20.00 132.00
## vabsabcabc_standard       22  38  93.00 25.39 100.00   96.44 15.57 20.00 126.00
##                          range  skew kurtosis   se
## dataset*                  -Inf    NA       NA   NA
## subgrp*                   -Inf    NA       NA   NA
## age                      22.83  0.33    -0.97 0.55
## meanFD                    4.56  7.54    64.83 0.04
## viq_all                  95.00 -0.99     1.51 1.59
## piq_all                  90.00 -0.93     0.67 1.67
## fsiq4_all                84.00 -1.28     1.67 1.55
## A_pct_severity            -Inf    NA       NA   NA
## B_pct_severity            -Inf    NA       NA   NA
## ADI_social_total          -Inf    NA       NA   NA
## ADI_communication_total   -Inf    NA       NA   NA
## ADI_RRB_total             -Inf    NA       NA   NA
## ados_2_SA_CSS             -Inf    NA       NA   NA
## ados_2_RRB_CSS            -Inf    NA       NA   NA
## SRS_tscore               53.00  2.14     5.82 1.16
## SRS_tscore_self          30.00  1.05     0.48 0.88
## RBS_total                89.00  6.60    45.89 1.45
## SSP_total               115.00 -2.88    10.92 2.64
## vabsdscoresc_dss        104.00 -1.21     1.00 4.08
## vabsdscoresd_dss         95.00 -1.34     1.26 3.63
## vabsdscoress_dss        112.00 -1.26     1.10 4.33
## vabsabcabc_standard     106.00 -1.42     1.40 4.12
#------------------------------------------------------------------------------
# Table of subtypes X sex
# For each half of the data: cross-tabulate subgroup by sex, then run a
# Pearson chi-squared test of independence on the same two columns.
# NOTE(review): the RRB_over_SC row is very sparse (3/3 in Discovery, 0/1 in
# Replication per the tables below), so the chi-squared approximation may be
# unreliable here -- consider confirming with fisher.test() or
# chisq.test(..., simulate.p.value = TRUE).

# Discovery
data2use = subset(data2write,data2write$dataset=="Discovery")
table(data2use$subgrp,data2use$sex)
##               
##                Female Male
##   RRB_over_SC       3    3
##   SC_equal_RRB     18   59
##   SC_over_RRB      14   36
##   TD               41   80
# Test result is stored in cs_res, then auto-printed
cs_res = chisq.test(data2use$subgrp,data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 3.7208, df = 3, p-value = 0.2932
# Replication
# Same steps on the Replication half; data2use and cs_res are overwritten
data2use = subset(data2write,data2write$dataset=="Replication")
table(data2use$subgrp,data2use$sex)
##               
##                Female Male
##   RRB_over_SC       0    1
##   SC_equal_RRB     23   60
##   SC_over_RRB      13   36
##   TD               47   75
cs_res = chisq.test(data2use$subgrp,data2use$sex)
cs_res
## 
##  Pearson's Chi-squared test
## 
## data:  data2use$subgrp and data2use$sex
## X-squared = 4.1601, df = 3, p-value = 0.2447
#------------------------------------------------------------------------------
# Dependent variables to model, one per row of the results table
vars2analyze = c(
  "age","meanFD","viq_all","piq_all","fsiq4_all",
  "A_pct_severity","B_pct_severity",
  "ADI_social_total","ADI_communication_total","ADI_RRB_total",
  "ados_2_SA_CSS","ados_2_RRB_CSS",
  "SRS_tscore_self","RBS_total","SSP_total",
  "vabsdscoress_dss","vabsdscoresd_dss","vabsdscoresc_dss","vabsabcabc_standard"
)

# Human-readable labels, same length as vars2analyze (presumably matched by
# position -- confirm where vnames is consumed).
# NOTE(review): "ADI-R RRB" appears twice (positions 7 and 10); verify that the
# label at position 7 is the intended one for B_pct_severity.
vnames = c(
  "Age","Mean FD","VIQ","PIQ","FIQ",
  "ADI-R SC","ADI-R RRB",
  "ADI-R Social","ADI-R Communication","ADI-R RRB",
  "ADOS SA CSS","ADOS RRB CSS",
  "SRS","RBS","SSP",
  "Vineland Socialization","Vineland Daily Living Skills","Vineland Communication","Vineland ABC"
)

# Result columns: omnibus statistics for each half, pairwise
# SC_equal_RRB vs SC_over_RRB statistics for each half, and a replication BF
cnames = c(
  "All_Disc.fstat",
  "All_Disc.pval",
  "All_Rep.fstat",
  "All_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.fstat",
  "SCequalRRB_vs_SCoverRRB_Disc.tstat",
  "SCequalRRB_vs_SCoverRRB_Disc.pval",
  "SCequalRRB_vs_SCoverRRB_Disc.es",
  "SCequalRRB_vs_SCoverRRB_Rep.fstat",
  "SCequalRRB_vs_SCoverRRB_Rep.tstat",
  "SCequalRRB_vs_SCoverRRB_Rep.pval",
  "SCequalRRB_vs_SCoverRRB_Rep.es",
  "SCequalRRB_vs_SCoverRRB.repBF"
)

# Empty (all-NA) results table: one row per variable, one column per statistic,
# plus a trailing varNames column that repeats the row names
output_res = data.frame(matrix(nrow = length(vars2analyze),ncol = length(cnames)))
dimnames(output_res) = list(vars2analyze, cnames)
output_res$varNames = vars2analyze

# For every outcome variable: fit mixed-effect models (random intercept per
# Centre, subgrp as the only fixed effect) in Discovery and Replication, first
# over all groups and then for the SC_equal_RRB vs SC_over_RRB contrast; store
# F/t/p/effect size in output_res, compute the replication Bayes factor for the
# contrast, and draw a per-dataset jitter + boxplot.
#
# NOTE: the replication Bayes factor was previously computed with the Discovery
# t-statistic passed as both tobs and trep, and with group sizes that included
# rows missing the outcome. Any previously saved/rendered repBF values were
# produced that way and need to be regenerated.
for (ivar in seq_along(vars2analyze)){

  y_var <- vars2analyze[ivar]
  # print(y_var)

  # model formulas are identical for all four fits of this variable
  fx_form <- as.formula(sprintf("%s ~ %s",y_var,"subgrp"))
  rx_form <- as.formula(sprintf("~ 1|%s","Centre"))

  #----------------------------------------------------------------------------
  # Discovery, all groups
  df4mod <- subset(df2use, df2use$dataset=="Discovery")

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"All_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Disc.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Replication, all groups
  df4mod <- subset(df2use, df2use$dataset=="Replication")

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"All_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"All_Rep.pval"] <- res["subgrp","p-value"]

  #----------------------------------------------------------------------------
  # Discovery, SC_equal_RRB vs SC_over_RRB only
  df4mod <- subset(df2use, df2use$dataset=="Discovery" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  # group sizes for the Bayes factor: count only rows that have the outcome,
  # since na.action = na.omit drops the others from the model
  # (assumes subgrp and Centre themselves have no missing values)
  has_y <- !is.na(df4mod[,y_var])
  n1 <- sum(df4mod$subgrp=="SC_equal_RRB" & has_y)
  m1 <- sum(df4mod$subgrp=="SC_over_RRB" & has_y)

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.pval"] <- res["subgrp","p-value"]
  # row 2 of the coefficient table is the subgrp contrast (row 1 is the intercept)
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                  df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication, SC_equal_RRB vs SC_over_RRB only
  df4mod <- subset(df2use, df2use$dataset=="Replication" & !is.element(df2use$subgrp,c("TD","RRB_over_SC")))
  has_y <- !is.na(df4mod[,y_var])
  n2 <- sum(df4mod$subgrp=="SC_equal_RRB" & has_y)
  m2 <- sum(df4mod$subgrp=="SC_over_RRB" & has_y)

  # mixed-effect model: site as random factor, subgrp as fixed factor
  mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

  # run ANOVA
  res <- anova(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.fstat"] <- res["subgrp","F-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.pval"] <- res["subgrp","p-value"]
  res <- summary(mod2use)
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"] <- res$tTable[2,"t-value"]
  output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.es"] <- cohens_d(df4mod[df4mod$subgrp=="SC_equal_RRB",y_var],
                                                                 df4mod[df4mod$subgrp=="SC_over_RRB",y_var])

  #----------------------------------------------------------------------------
  # Replication Bayes Factor
  # tobs is the Discovery t-statistic, trep the Replication t-statistic
  res_bf <- BFSALL(tobs = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Disc.tstat"],
                   trep = output_res[y_var,"SCequalRRB_vs_SCoverRRB_Rep.tstat"],
                   n1 = n1,
                   n2 = n2,
                   m1 = m1,
                   m2 = m2,
                   sample = 2,
                   Type = 'ALL')
  output_res[y_var,"SCequalRRB_vs_SCoverRRB.repBF"] <- res_bf["Replication BF","Replication 1"]

  # make a plot (RRB_over_SC is left out; panels are split by dataset)
  colors2use <- get_ggColorHue(3)
  df4plot <- subset(df2use, df2use$subgrp!="RRB_over_SC")
  p <- ggplot(data = df4plot, aes_string(x = "subgrp", y = y_var, colour = "subgrp")) + facet_grid(. ~ dataset)
  p <- p + geom_jitter() + geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA)
  p <- p + ylab(vnames[ivar]) + xlab("Group") +
    scale_x_discrete(labels=c("SC_equal_RRB"="SC=RRB","SC_over_RRB"="SC>RRB","TD"="TD")) +
    scale_colour_manual(values = c(colors2use[2:3],"grey60")) + guides(colour=FALSE) +
    theme(text = element_text(size=fontSize-5),
          axis.text.x = element_text(size=fontSize-5),
          axis.text.y = element_text(size=fontSize-5))
  print(p)

}

# Copy the SC=RRB vs SC>RRB effect sizes for Vineland ABC and Vineland Daily
# Living Skills into row "1" of the vabc / vabc_dls tables, then show the
# full results table.
vabc["1", "Discovery"] <-
  output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc["1", "Replication"] <-
  output_res["vabsabcabc_standard", "SCequalRRB_vs_SCoverRRB_Rep.es"]
vabc_dls["1", "Discovery"] <-
  output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Disc.es"]
vabc_dls["1", "Replication"] <-
  output_res["vabsdscoresd_dss", "SCequalRRB_vs_SCoverRRB_Rep.es"]
print(output_res)
##                         All_Disc.fstat All_Disc.pval All_Rep.fstat All_Rep.pval
## age                          0.3409502  7.957351e-01     0.6350975 5.930333e-01
## meanFD                       2.1793571  9.102259e-02     0.1076975 9.555433e-01
## viq_all                      2.2881571  7.915093e-02     5.1067969 1.915639e-03
## piq_all                      1.6155392  1.863606e-01     2.6993027 4.636135e-02
## fsiq4_all                    2.3403632  7.396269e-02     3.9938050 8.405111e-03
## A_pct_severity              29.1910165  3.708867e-11    54.6027167 0.000000e+00
## B_pct_severity              24.8789326  7.638108e-10     1.9898230 1.409594e-01
## ADI_social_total             2.4614717  8.937070e-02     9.7525012 1.147904e-04
## ADI_communication_total      0.6602227  5.185006e-01    10.7400003 4.904794e-05
## ADI_RRB_total               24.9062961  7.489428e-10     0.6930022 5.019534e-01
## ados_2_SA_CSS                2.4392023  9.140608e-02     0.8423143 4.332017e-01
## ados_2_RRB_CSS               0.1235088  8.839224e-01     1.6540440 1.955349e-01
## SRS_tscore_self             36.4235907  0.000000e+00    48.1766540 5.551115e-16
## RBS_total                   19.8557033  4.405487e-11    16.9467104 1.121701e-09
## SSP_total                   30.5392542  5.218048e-15    24.2225110 1.689981e-12
## vabsdscoress_dss            23.3455960  1.858957e-12    24.2020517 7.137624e-13
## vabsdscoresd_dss            12.6970004  1.951534e-07    10.6333916 2.184428e-06
## vabsdscoresc_dss            10.3252985  3.216407e-06    11.4872327 7.792022e-07
## vabsabcabc_standard         19.0024720  1.726517e-10    17.3725254 9.638763e-10
##                         SCequalRRB_vs_SCoverRRB_Disc.fstat
## age                                            0.002926778
## meanFD                                         0.561915762
## viq_all                                        0.144543558
## piq_all                                        0.005605256
## fsiq4_all                                      0.038733646
## A_pct_severity                                41.199213972
## B_pct_severity                                33.793235961
## ADI_social_total                               2.723119508
## ADI_communication_total                        0.941846172
## ADI_RRB_total                                 26.725139105
## ados_2_SA_CSS                                  0.060358486
## ados_2_RRB_CSS                                 0.206277233
## SRS_tscore_self                                0.014011628
## RBS_total                                      1.930634341
## SSP_total                                      0.679098271
## vabsdscoress_dss                               2.829696399
## vabsdscoresd_dss                               3.318283400
## vabsdscoresc_dss                               1.832087369
## vabsabcabc_standard                            4.440678576
##                         SCequalRRB_vs_SCoverRRB_Disc.tstat
## age                                            -0.05409971
## meanFD                                         -0.74961041
## viq_all                                        -0.38018885
## piq_all                                        -0.07486826
## fsiq4_all                                      -0.19680865
## A_pct_severity                                  6.41866138
## B_pct_severity                                 -5.81319499
## ADI_social_total                                1.65018772
## ADI_communication_total                         0.97048759
## ADI_RRB_total                                  -5.16963626
## ados_2_SA_CSS                                   0.24567964
## ados_2_RRB_CSS                                 -0.45417754
## SRS_tscore_self                                 0.11837072
## RBS_total                                      -1.38947268
## SSP_total                                       0.82407419
## vabsdscoress_dss                               -1.68217015
## vabsdscoresd_dss                               -1.82161560
## vabsdscoresc_dss                               -1.35354622
## vabsabcabc_standard                            -2.10729176
##                         SCequalRRB_vs_SCoverRRB_Disc.pval
## age                                          9.569442e-01
## meanFD                                       4.549324e-01
## viq_all                                      7.044773e-01
## piq_all                                      9.404442e-01
## fsiq4_all                                    8.443048e-01
## A_pct_severity                               2.748584e-09
## B_pct_severity                               5.016122e-08
## ADI_social_total                             1.014768e-01
## ADI_communication_total                      3.337236e-01
## ADI_RRB_total                                9.297268e-07
## ados_2_SA_CSS                                8.063535e-01
## ados_2_RRB_CSS                               6.505284e-01
## SRS_tscore_self                              9.062389e-01
## RBS_total                                    1.677147e-01
## SSP_total                                    4.126190e-01
## vabsdscoress_dss                             9.527311e-02
## vabsdscoresd_dss                             7.115856e-02
## vabsdscoresc_dss                             1.785588e-01
## vabsabcabc_standard                          3.730405e-02
##                         SCequalRRB_vs_SCoverRRB_Disc.es
## age                                         0.005202566
## meanFD                                      0.136146695
## viq_all                                     0.057261993
## piq_all                                     0.017377531
## fsiq4_all                                   0.035745032
## A_pct_severity                             -1.068087853
## B_pct_severity                              0.958629595
## ADI_social_total                           -0.253239685
## ADI_communication_total                    -0.097220221
## ADI_RRB_total                               0.900500211
## ados_2_SA_CSS                              -0.034527370
## ados_2_RRB_CSS                              0.052238597
## SRS_tscore_self                             0.118932130
## RBS_total                                   0.269321123
## SSP_total                                  -0.091977701
## vabsdscoress_dss                            0.287326559
## vabsdscoresd_dss                            0.327620942
## vabsdscoresc_dss                            0.255933511
## vabsabcabc_standard                         0.393924828
##                         SCequalRRB_vs_SCoverRRB_Rep.fstat
## age                                            0.04586012
## meanFD                                         0.08589090
## viq_all                                        6.92792527
## piq_all                                        3.25026109
## fsiq4_all                                      4.54321880
## A_pct_severity                               107.43254137
## B_pct_severity                                 2.41260784
## ADI_social_total                              19.47713747
## ADI_communication_total                       19.29022585
## ADI_RRB_total                                  0.00133106
## ados_2_SA_CSS                                  1.48802963
## ados_2_RRB_CSS                                 1.61507709
## SRS_tscore_self                                0.10212748
## RBS_total                                     12.37692724
## SSP_total                                      2.33246365
## vabsdscoress_dss                              18.79391008
## vabsdscoresd_dss                              12.26257294
## vabsdscoresc_dss                              21.97124650
## vabsabcabc_standard                           24.02962953
##                         SCequalRRB_vs_SCoverRRB_Rep.tstat
## age                                           -0.21414977
## meanFD                                        -0.29307149
## viq_all                                       -2.63209522
## piq_all                                       -1.80284805
## fsiq4_all                                     -2.13148277
## A_pct_severity                                10.36496702
## B_pct_severity                                -1.55325717
## ADI_social_total                               4.41329100
## ADI_communication_total                        4.39206396
## ADI_RRB_total                                 -0.03648369
## ados_2_SA_CSS                                  1.21984820
## ados_2_RRB_CSS                                -1.27085683
## SRS_tscore_self                                0.31957391
## RBS_total                                      3.51808574
## SSP_total                                     -1.52724054
## vabsdscoress_dss                              -4.33519435
## vabsdscoresd_dss                              -3.50179567
## vabsdscoresc_dss                              -4.68734962
## vabsabcabc_standard                           -4.90200260
##                         SCequalRRB_vs_SCoverRRB_Rep.pval
## age                                         8.307737e-01
## meanFD                                      7.699458e-01
## viq_all                                     9.573231e-03
## piq_all                                     7.382127e-02
## fsiq4_all                                   3.502273e-02
## A_pct_severity                              0.000000e+00
## B_pct_severity                              1.228502e-01
## ADI_social_total                            2.151451e-05
## ADI_communication_total                     2.342006e-05
## ADI_RRB_total                               9.709540e-01
## ados_2_SA_CSS                               2.248753e-01
## ados_2_RRB_CSS                              2.061975e-01
## SRS_tscore_self                             7.504396e-01
## RBS_total                                   6.351267e-04
## SSP_total                                   1.307987e-01
## vabsdscoress_dss                            3.137227e-05
## vabsdscoresd_dss                            6.608324e-04
## vabsdscoresc_dss                            7.661567e-06
## vabsabcabc_standard                         3.169900e-06
##                         SCequalRRB_vs_SCoverRRB_Rep.es
## age                                         0.03858047
## meanFD                                      0.05279873
## viq_all                                     0.51039535
## piq_all                                     0.38510505
## fsiq4_all                                   0.43724080
## A_pct_severity                             -1.79780755
## B_pct_severity                              0.29551079
## ADI_social_total                           -0.74987778
## ADI_communication_total                    -0.73819834
## ADI_RRB_total                               0.02607430
## ados_2_SA_CSS                              -0.19191587
## ados_2_RRB_CSS                              0.15678698
## SRS_tscore_self                            -0.08339268
## RBS_total                                  -0.62512296
## SSP_total                                   0.30503384
## vabsdscoress_dss                            0.80642663
## vabsdscoresd_dss                            0.64402872
## vabsdscoresc_dss                            0.88770839
## vabsabcabc_standard                         0.89696314
##                         SCequalRRB_vs_SCoverRRB.repBF                varNames
## age                                      7.056294e-01                     age
## meanFD                                   9.326246e-01                  meanFD
## viq_all                                  7.561628e-01                 viq_all
## piq_all                                  7.063480e-01                 piq_all
## fsiq4_all                                7.180131e-01               fsiq4_all
## A_pct_severity                           4.435845e+07          A_pct_severity
## B_pct_severity                           2.470910e+06          B_pct_severity
## ADI_social_total                         2.727642e+00        ADI_social_total
## ADI_communication_total                  1.128619e+00 ADI_communication_total
## ADI_RRB_total                            1.390222e+05           ADI_RRB_total
## ados_2_SA_CSS                            7.268164e-01           ados_2_SA_CSS
## ados_2_RRB_CSS                           7.806136e-01          ados_2_RRB_CSS
## SRS_tscore_self                          7.090095e-01         SRS_tscore_self
## RBS_total                                1.840873e+00               RBS_total
## SSP_total                                9.902609e-01               SSP_total
## vabsdscoress_dss                         2.864643e+00        vabsdscoress_dss
## vabsdscoresd_dss                         3.644820e+00        vabsdscoresd_dss
## vabsdscoresc_dss                         1.753322e+00        vabsdscoresc_dss
## vabsabcabc_standard                      6.296295e+00     vabsabcabc_standard
# # plot Vineland ABC effect sizes over thresholds
# tmp = data.frame(vabc)
# tmp$threshold = factor(rownames(tmp))
#
# df2plot = melt(tmp)
# p = ggplot(data = df2plot, aes(x = threshold, y =value, color=variable, group=variable)) + facet_grid(. ~ variable)
# p = p + geom_line(size=4) +
#   geom_point(size=7) +
#   ylab("Cohen's d") +
#   xlab("Z-threshold") +
#   ylim(0,1) +
#   guides(color=FALSE) +
#   theme(text = element_text(size=fontSize),
#         axis.text.x = element_text(size=fontSize),
#         axis.text.y = element_text(size=fontSize))
# p

Vineland ABC

# Vineland ABC: Cohen's d at each z-threshold, one panel per column of vabc
tmp <- data.frame(vabc)
tmp$threshold <- factor(rownames(tmp))

# reshape to long format (threshold is the only non-numeric column, so melt
# uses it as the id variable)
df2plot <- melt(tmp)
p <- ggplot(data = df2plot,
            aes(x = threshold, y = value, color = variable, group = variable)) +
  facet_grid(. ~ variable) +
  geom_line(size = 4) +
  geom_point(size = 7) +
  ylab("Cohen's d") +
  xlab("Z-threshold") +
  ylim(0, 1) +
  scale_colour_manual(values = c("dodger blue", "#ff8d1e")) +
  guides(color = FALSE) +
  theme(text = element_text(size = fontSize),
        axis.text.x = element_text(size = fontSize),
        axis.text.y = element_text(size = fontSize))
print(p)

#------------------------------------------------------------------------------
# model effect of z difference score on Vineland ABC
y_var <- "vabsabcabc_standard"

# Discovery
df4mod <- subset(data2write, data2write$dataset=="Discovery")
df4mod <- merge(df4mod,tmp_test[,c("subid","z_ds")],by.x="subid",by.y="subid")
# recode the 999 sentinel as missing; %in% never returns NA, so this also works
# when the column already contains NA (an NA in a data-frame row subscript
# makes the `==` form of this assignment fail)
df4mod[df4mod[,y_var] %in% 999,y_var] <- NA
# df4mod[df4mod[,y_var]==777,y_var] = NA
# sample size for the Bayes factor: participants with a non-missing outcome
n_disc <- sum(!is.na(df4mod[,y_var]))

# construct linear model
# mixed-effect model: site as random factor, z_ds as fixed effect
fx_form <- as.formula(sprintf("%s ~ %s",y_var,"z_ds"))
rx_form <- as.formula(sprintf("~ 1|%s","Centre"))
mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA and keep the t-statistic of the z_ds slope for the Bayes factor
res <- anova(mod2use)
tres <- summary(mod2use)
t_disc <- tres$tTable["z_ds","t-value"]
res
##             numDF denDF  F-value p-value
## (Intercept)     1   117 576.3867  <.0001
## z_ds            1   117   4.4529   0.037
cor.test(df4mod[,y_var],df4mod$z_ds)
## 
##  Pearson's product-moment correlation
## 
## data:  df4mod[, y_var] and df4mod$z_ds
## t = -1.6818, df = 120, p-value = 0.0952
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.32085672  0.02673098
## sample estimates:
##       cor 
## -0.151751
# Replication
df4mod <- subset(data2write, data2write$dataset=="Replication")
df4mod <- merge(df4mod,tmp_test[,c("subid","z_ds")],by.x="subid",by.y="subid")
# recode the 999 sentinel as missing; %in% never returns NA, so this also works
# when the column already contains NA (an NA in a data-frame row subscript
# makes the `==` form of this assignment fail)
df4mod[df4mod[,y_var] %in% 999,y_var] <- NA
# df4mod[df4mod[,y_var]==777,y_var] = NA
# sample size for the Bayes factor: participants with a non-missing outcome
n_rep <- sum(!is.na(df4mod[,y_var]))

# construct linear model
# mixed-effect model: site as random factor, z_ds as fixed effect
fx_form <- as.formula(sprintf("%s ~ %s",y_var,"z_ds"))
rx_form <- as.formula(sprintf("~ 1|%s","Centre"))
mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA and keep the t-statistic of the z_ds slope for the Bayes factor
res <- anova(mod2use)
tres <- summary(mod2use)
t_rep <- tres$tTable["z_ds","t-value"]
res
##             numDF denDF  F-value p-value
## (Intercept)     1   115 544.8477  <.0001
## z_ds            1   115  25.1670  <.0001
cor.test(df4mod[,y_var],df4mod$z_ds)
## 
##  Pearson's product-moment correlation
## 
## data:  df4mod[, y_var] and df4mod$z_ds
## t = -4.6877, df = 118, p-value = 7.474e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5373047 -0.2335690
## sample estimates:
##        cor 
## -0.3962224
# Replication Bayes factor for the z_ds slope: Discovery t-statistic and
# sample size as the original study, Replication as the replication attempt
res_bf <- BFSALL(
  tobs = t_disc,
  trep = t_rep,
  n1 = n_disc,
  n2 = n_rep,
  sample = 1,
  Type = "ALL"
)
print(res_bf["Replication BF", "Replication 1"])
## [1] 8931.714
# plot scatterplot of z_ds by Vineland ABC (both datasets, one panel each)
df4plot <- data2write
df4plot <- merge(df4plot,tmp_test[,c("subid","z_ds")],by.x="subid",by.y="subid")
# recode the 999 sentinel as missing; %in% never returns NA, so this also works
# when the column already contains NA (an NA in a data-frame row subscript
# makes the `==` form of this assignment fail)
df4plot[df4plot[,y_var] %in% 999,y_var] <- NA
# df4plot[df4plot[,y_var]==777,y_var] = NA

p <- ggplot(data=df4plot, aes(x = z_ds, y = vabsabcabc_standard, colour = dataset)) + facet_grid(. ~ dataset)
p <- p + geom_point(size=3) + geom_smooth(method=lm) + xlab("Z SC-RRB") + ylab("Vineland ABC") +
  scale_colour_manual(values = c("dodger blue","#ff8d1e")) + guides(colour=FALSE) +
  theme(text = element_text(size=fontSize),
        axis.text.x = element_text(size=fontSize),
        axis.text.y = element_text(size=fontSize))
p

Vineland Daily Living Skills

# Vineland Daily Living Skills: Cohen's d at each z-threshold, one panel per
# column of vabc_dls
tmp <- data.frame(vabc_dls)
tmp$threshold <- factor(rownames(tmp))

# reshape to long format (threshold is the only non-numeric column, so melt
# uses it as the id variable)
df2plot <- melt(tmp)
p <- ggplot(data = df2plot,
            aes(x = threshold, y = value, color = variable, group = variable)) +
  facet_grid(. ~ variable) +
  geom_line(size = 4) +
  geom_point(size = 7) +
  ylab("Cohen's d") +
  xlab("Z-threshold") +
  ylim(-0.1, 1) +
  scale_colour_manual(values = c("dodger blue", "#ff8d1e")) +
  guides(color = FALSE) +
  theme(text = element_text(size = fontSize),
        axis.text.x = element_text(size = fontSize),
        axis.text.y = element_text(size = fontSize))
print(p)

#------------------------------------------------------------------------------
# model effect of z difference score on Vineland Daily Living Skills
y_var <- "vabsdscoresd_dss"

# Discovery
df4mod <- subset(data2write, data2write$dataset=="Discovery")
df4mod <- merge(df4mod,tmp_test[,c("subid","z_ds")],by.x="subid",by.y="subid")
# recode the 999 sentinel as missing; %in% never returns NA, so this also works
# when the column already contains NA (an NA in a data-frame row subscript
# makes the `==` form of this assignment fail)
df4mod[df4mod[,y_var] %in% 999,y_var] <- NA
# df4mod[df4mod[,y_var]==777,y_var] = NA
# sample size for the Bayes factor: participants with a non-missing outcome
n_disc <- sum(!is.na(df4mod[,y_var]))

# construct linear model
# mixed-effect model: site as random factor, z_ds as fixed effect
fx_form <- as.formula(sprintf("%s ~ %s",y_var,"z_ds"))
rx_form <- as.formula(sprintf("~ 1|%s","Centre"))
mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA and keep the t-statistic of the z_ds slope for the Bayes factor
res <- anova(mod2use)
tres <- summary(mod2use)
t_disc <- tres$tTable["z_ds","t-value"]
res
##             numDF denDF  F-value p-value
## (Intercept)     1   117 392.4521  <.0001
## z_ds            1   117   1.0061  0.3179
cor.test(df4mod[,y_var],df4mod$z_ds)
## 
##  Pearson's product-moment correlation
## 
## data:  df4mod[, y_var] and df4mod$z_ds
## t = -0.5396, df = 120, p-value = 0.5905
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2249921  0.1296964
## sample estimates:
##         cor 
## -0.04919883
# Replication
df4mod <- subset(data2write, data2write$dataset=="Replication")
df4mod <- merge(df4mod,tmp_test[,c("subid","z_ds")],by.x="subid",by.y="subid")
# recode the 999 sentinel as missing; %in% never returns NA, so this also works
# when the column already contains NA (an NA in a data-frame row subscript
# makes the `==` form of this assignment fail)
df4mod[df4mod[,y_var] %in% 999,y_var] <- NA
# df4mod[df4mod[,y_var]==777,y_var] = NA
# sample size for the Bayes factor: participants with a non-missing outcome
n_rep <- sum(!is.na(df4mod[,y_var]))

# construct linear model
# mixed-effect model: site as random factor, z_ds as fixed effect
fx_form <- as.formula(sprintf("%s ~ %s",y_var,"z_ds"))
rx_form <- as.formula(sprintf("~ 1|%s","Centre"))
mod2use <- eval(substitute(lme(fixed = fx_form, random = rx_form, data = df4mod, na.action = na.omit)))

# run ANOVA and keep the t-statistic of the z_ds slope for the Bayes factor
res <- anova(mod2use)
tres <- summary(mod2use)
t_rep <- tres$tTable["z_ds","t-value"]
res
##             numDF denDF  F-value p-value
## (Intercept)     1   115 326.2800  <.0001
## z_ds            1   115  13.6755   3e-04
cor.test(df4mod[,y_var],df4mod$z_ds)
## 
##  Pearson's product-moment correlation
## 
## data:  df4mod[, y_var] and df4mod$z_ds
## t = -3.4038, df = 118, p-value = 0.0009085
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4539225 -0.1265501
## sample estimates:
##       cor 
## -0.299009
# Replication Bayes factor for the z_ds slope: Discovery t-statistic and
# sample size as the original study, Replication as the replication attempt
res_bf <- BFSALL(
  tobs = t_disc,
  trep = t_rep,
  n1 = n_disc,
  n2 = n_rep,
  sample = 1,
  Type = "ALL"
)
print(res_bf["Replication BF", "Replication 1"])
## [1] 79.65693
# plot scatterplot of z_ds by Vineland Daily Living Skills (both datasets, one panel each)
df4plot <- data2write
df4plot <- merge(df4plot,tmp_test[,c("subid","z_ds")],by.x="subid",by.y="subid")
# recode the 999 sentinel as missing; %in% never returns NA, so this also works
# when the column already contains NA (an NA in a data-frame row subscript
# makes the `==` form of this assignment fail)
df4plot[df4plot[,y_var] %in% 999,y_var] <- NA
# df4plot[df4plot[,y_var]==777,y_var] = NA

p <- ggplot(data=df4plot, aes(x = z_ds, y = vabsdscoresd_dss, colour = dataset)) + facet_grid(. ~ dataset)
p <- p + geom_point(size=3) + geom_smooth(method=lm) + xlab("Z SC-RRB") + ylab("Vineland Daily Living Skills") +
  scale_colour_manual(values = c("dodger blue","#ff8d1e")) + guides(colour=FALSE) +
  theme(text = element_text(size=fontSize),
        axis.text.x = element_text(size=fontSize),
        axis.text.y = element_text(size=fontSize))
p